pythonflex 0.1.6__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +2 -2
- pythonflex/analysis.py +101 -27
- pythonflex/examples/basic_usage.py +2 -3
- pythonflex/examples/test.py +104 -0
- pythonflex/plotting.py +412 -252
- {pythonflex-0.1.6.dist-info → pythonflex-0.2.dist-info}/METADATA +1 -1
- {pythonflex-0.1.6.dist-info → pythonflex-0.2.dist-info}/RECORD +9 -8
- {pythonflex-0.1.6.dist-info → pythonflex-0.2.dist-info}/WHEEL +0 -0
- {pythonflex-0.1.6.dist-info → pythonflex-0.2.dist-info}/entry_points.txt +0 -0
pythonflex/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
from .logging_config import log
|
|
3
3
|
from .utils import dsave, dload
|
|
4
4
|
from .preprocessing import get_example_data_path, load_datasets, get_common_genes, filter_matrix_by_genes, load_gold_standard, filter_duplicate_terms
|
|
5
|
-
from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv
|
|
5
|
+
from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv, update_matploblib_config
|
|
6
6
|
from .plotting import (
|
|
7
7
|
adjust_text_positions, plot_precision_recall_curve, plot_percomplex_scatter,
|
|
8
8
|
plot_percomplex_scatter_bysize, plot_complex_contributions, plot_significant_complexes, plot_auc_scores
|
|
@@ -14,5 +14,5 @@ __all__ = [ "log", "get_example_data_path", "fast_corr",
|
|
|
14
14
|
"perform_corr", "is_symmetric", "binary", "has_mirror_of_first_pair", "convert_full_to_half_matrix",
|
|
15
15
|
"drop_mirror_pairs", "quick_sort", "complex_contributions", "adjust_text_positions", "plot_precision_recall_curve",
|
|
16
16
|
"plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
|
|
17
|
-
"plot_significant_complexes", "plot_auc_scores", "save_results_to_csv"
|
|
17
|
+
"plot_significant_complexes", "plot_auc_scores", "save_results_to_csv", "update_matploblib_config"
|
|
18
18
|
]
|
pythonflex/analysis.py
CHANGED
|
@@ -23,7 +23,7 @@ from .logging_config import log
|
|
|
23
23
|
from .preprocessing import filter_matrix_by_genes
|
|
24
24
|
from .utils import dsave, dload, _sanitize
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
import matplotlib as mpl
|
|
27
27
|
|
|
28
28
|
def deep_update(source, overrides):
|
|
29
29
|
"""Recursively update the source dict with the overrides."""
|
|
@@ -40,7 +40,7 @@ def initialize(config={}):
|
|
|
40
40
|
|
|
41
41
|
default_config = {
|
|
42
42
|
"min_genes_in_complex": 3,
|
|
43
|
-
"min_genes_per_complex_analysis":
|
|
43
|
+
"min_genes_per_complex_analysis": 2,
|
|
44
44
|
"output_folder": "output",
|
|
45
45
|
"gold_standard": "CORUM",
|
|
46
46
|
"color_map": "RdYlBu",
|
|
@@ -48,7 +48,7 @@ def initialize(config={}):
|
|
|
48
48
|
"plotting": {
|
|
49
49
|
"save_plot": True,
|
|
50
50
|
"show_plot": True,
|
|
51
|
-
"output_type": "
|
|
51
|
+
"output_type": "pdf",
|
|
52
52
|
},
|
|
53
53
|
"preprocessing": {
|
|
54
54
|
"normalize": False,
|
|
@@ -95,31 +95,105 @@ def initialize(config={}):
|
|
|
95
95
|
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
'
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def update_matploblib_config(config=None, font_family="Arial", layout="single"):
    """
    Apply a compact, publication-oriented set of matplotlib rcParams.

    The settings written to ``mpl.rcParams`` are:
    - 7 pt fonts (base, labels, ticks, legend) and 9 pt axes titles
    - 0.5 pt spines and major ticks, ticks pointing out, no top/right ticks,
      minor ticks hidden
    - no grid, frameless legend
    - a ten-colour Tableau-style colour cycle
    - TrueType (Type 42) font embedding for PDF/PS, text kept as text in SVG
    - 600 dpi figures, tight bounding box on save

    The misspelled function name ("matploblib") is the exported public name
    and is kept as is.

    Args:
        config (dict, optional): Accepted for API compatibility; the current
            implementation does not read it.
        font_family (str): Preferred font family. If matplotlib cannot find
            it, 'Helvetica' is used instead and a warning is printed.
        layout (str or tuple): "double" selects 7.2 x 5.4 in, a
            (width, height) pair (tuple or list) in inches is used verbatim,
            and any other value selects the 4.0 x 3.0 in "single" preset.
    """
    # Probe for the requested font. findfont() silently substitutes the
    # default font unless fallback_to_default=False, so the flag is required
    # for the fallback branch to ever run.
    requested_font = font_family
    try:
        from matplotlib.font_manager import findfont, FontProperties
        findfont(FontProperties(family=requested_font), fallback_to_default=False)
    except Exception:
        # Best effort: a failed font lookup must never abort configuration.
        font_family = "Helvetica"
        print(f"Warning: '{requested_font}' not found, falling back to 'Helvetica'.")

    # Figure size presets, in inches.
    if isinstance(layout, (tuple, list)):
        fig_w, fig_h = layout
    elif layout == "double":
        fig_w, fig_h = 7.2, 5.4  # ~183 mm wide
    else:  # "single" and any unrecognised value
        fig_w, fig_h = 4.0, 3.0  # ~102 mm x ~76 mm

    # Tableau-10 style colour cycle
    cb_cycle = [
        "#4E79A7", "#F28E2B", "#E15759", "#76B7B2", "#59A14F",
        "#EDC948", "#B07AA1", "#FF9DA7", "#9C755F", "#BAB0AC"
    ]

    mpl.rcParams.update({
        # --- Text & Fonts ---
        "text.usetex": False,               # no external LaTeX
        "font.family": [font_family],
        "mathtext.fontset": "dejavusans",   # built-in DejaVu Sans math fonts
        "mathtext.default": "regular",      # math text uses the regular font style
        "axes.unicode_minus": True,
        # --- Sizes ---
        "font.size": 7,
        "axes.titlesize": 9,
        "axes.labelsize": 7,
        "legend.fontsize": 7,
        "xtick.labelsize": 7,
        "ytick.labelsize": 7,
        # --- Lines & Markers ---
        "lines.linewidth": 1.5,
        "lines.markersize": 4.0,
        "patch.linewidth": 0.5,
        "errorbar.capsize": 2,
        # --- Axes, Spines, Ticks ---
        "axes.linewidth": 0.5,
        "axes.edgecolor": "black",
        "axes.facecolor": "none",
        "axes.titlepad": 3.0,
        "axes.labelpad": 2.0,
        "axes.prop_cycle": mpl.cycler(color=cb_cycle),
        "xtick.direction": "out",
        "ytick.direction": "out",
        "xtick.major.size": 2.5,
        "ytick.major.size": 2.5,
        "xtick.minor.visible": False,
        "ytick.minor.visible": False,
        "xtick.major.width": 0.5,
        "ytick.major.width": 0.5,
        "xtick.top": False,
        "ytick.right": False,
        # --- Grid ---
        "axes.grid": False,
        # --- Legend ---
        "legend.frameon": False,
        "legend.handlelength": 1.6,
        "legend.handletextpad": 0.4,
        "legend.borderaxespad": 0.3,
        "legend.loc": "best",
        # --- Figure & Save ---
        "figure.dpi": 600,
        "figure.figsize": (fig_w, fig_h),
        "savefig.dpi": 600,
        "savefig.bbox": "tight",
        "savefig.pad_inches": 0.1,
        "savefig.transparent": False,
        # --- PDF/SVG Export ---
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "pdf.use14corefonts": False,
        "svg.fonttype": "none",
    })
|
|
122
|
-
log.done("Matplotlib settings updated.")
|
|
123
197
|
|
|
124
198
|
|
|
125
199
|
|
|
@@ -22,7 +22,6 @@ inputs = {
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
#%%
|
|
25
|
-
|
|
26
25
|
default_config = {
|
|
27
26
|
"min_genes_in_complex": 0,
|
|
28
27
|
"min_genes_per_complex_analysis": 3,
|
|
@@ -32,7 +31,7 @@ default_config = {
|
|
|
32
31
|
"jaccard": True,
|
|
33
32
|
"plotting": {
|
|
34
33
|
"save_plot": True,
|
|
35
|
-
"output_type": "
|
|
34
|
+
"output_type": "pdf",
|
|
36
35
|
},
|
|
37
36
|
"preprocessing": {
|
|
38
37
|
"fill_na": True,
|
|
@@ -48,7 +47,6 @@ default_config = {
|
|
|
48
47
|
flex.initialize(default_config)
|
|
49
48
|
|
|
50
49
|
# Load datasets and gold standard terms
|
|
51
|
-
|
|
52
50
|
data, _ = flex.load_datasets(inputs)
|
|
53
51
|
terms, genes_in_terms = flex.load_gold_standard()
|
|
54
52
|
|
|
@@ -59,6 +57,7 @@ for name, dataset in data.items():
|
|
|
59
57
|
pra = flex.pra(name, dataset, is_corr=False)
|
|
60
58
|
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
61
59
|
cc = flex.complex_contributions(name)
|
|
60
|
+
|
|
62
61
|
|
|
63
62
|
|
|
64
63
|
#%%
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#%%
|
|
2
|
+
import pythonflex as flex
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Example driver: runs the pythonflex pipeline on per-lineage subset files
# plus the full gene-effect matrix. Paths below are machine-specific.
DATA_DIR = "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/"

# Lineage files to analyse; comment entries in or out as needed.
cell_line_files = [
    "soft_tissue_cell_lines.csv",
    "skin_cell_lines.csv",
    # "lung_cell_lines.csv",
    # "head_and_neck_cell_lines.csv",
    # "esophagus_stomach_cell_lines.csv",
]

inputs = {}

# One entry per lineage, keyed by the file name without "_cell_lines.csv".
for filename in cell_line_files:
    key = filename.replace("_cell_lines.csv", "")
    full_path = os.path.join(DATA_DIR, filename)
    inputs[key] = dict(path=full_path, sort="high")

# The complete gene-effect matrix is analysed alongside the subsets.
inputs['depmap'] = dict(
    path="C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    sort="high",
)

# Echo what will be loaded.
print("Configured inputs:")
for key, value in inputs.items():
    print(f" {key}: {value['path']}")


default_config = {
    "min_genes_in_complex": 2,
    "min_genes_per_complex_analysis": 2,
    "output_folder": "25q2_min_genes_2",
    "gold_standard": "CORUM",
    "color_map": "RdYlBu",
    "jaccard": True,
    "plotting": {
        "save_plot": True,
        "output_type": "pdf",
    },
    "preprocessing": {
        "fill_na": True,
        "normalize": False,
    },
    "corr_function": "numpy",
    "logging": {
        # other level names seen in this script: "PROGRESS", "INFO", "WARNING"
        "visible_levels": ["DONE", "STARTED"]
    }
}

# Set up logger, configuration and output folder.
flex.initialize(default_config)

# Load the datasets and the gold-standard terms.
data, _ = flex.load_datasets(inputs)
terms, genes_in_terms = flex.load_gold_standard()


#%%
# Run the analysis for every loaded dataset.
for name, dataset in data.items():
    pra = flex.pra(name, dataset, is_corr=False)
    fpc = flex.pra_percomplex(name, dataset, is_corr=False)
    cc = flex.complex_contributions(name)


#%%
# Produce all plots.
flex.plot_auc_scores()
flex.plot_precision_recall_curve()
flex.plot_percomplex_scatter()
flex.plot_percomplex_scatter_bysize()
flex.plot_significant_complexes()
flex.plot_complex_contributions()


#%%
# Export the results as CSV.
flex.save_results_to_csv()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
#%%
|
|
103
|
+
|
|
104
|
+
|
pythonflex/plotting.py
CHANGED
|
@@ -8,302 +8,458 @@ import pandas as pd
|
|
|
8
8
|
import matplotlib.pyplot as plt
|
|
9
9
|
from matplotlib import patches
|
|
10
10
|
from matplotlib.cm import get_cmap
|
|
11
|
-
from
|
|
11
|
+
from matplotlib.ticker import NullFormatter, NullLocator
|
|
12
|
+
|
|
13
|
+
# Completely disable LaTeX and clear all font cache/references
|
|
14
|
+
import matplotlib as mpl
|
|
15
|
+
import matplotlib.font_manager as fm
|
|
16
|
+
|
|
17
|
+
# Import-time matplotlib defaults for this module: no LaTeX, stock font lists.
mpl.rcParams['text.usetex'] = False


# Explicit font family lists, so text rendering does not depend on a user's
# customised rc file.
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.serif'] = ['DejaVu Serif', 'Times New Roman', 'Bitstream Vera Serif', 'serif']
mpl.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial', 'Bitstream Vera Sans', 'sans-serif']
mpl.rcParams['font.cursive'] = ['Apple Chancery', 'Textile', 'Zapf Chancery', 'Sand', 'Script MT', 'Felipa', 'cursive']
mpl.rcParams['font.fantasy'] = ['Comic Sans MS', 'Chicago', 'Charcoal', 'Impact', 'Western', 'Humor Sans', 'fantasy']
mpl.rcParams['font.monospace'] = ['DejaVu Sans Mono', 'Bitstream Vera Sans Mono', 'Computer Modern Typewriter', 'Andale Mono', 'Nimbus Mono L', 'Courier New', 'Courier', 'Fixed', 'Terminal', 'monospace']

# Math text uses matplotlib's built-in DejaVu Sans fonts in the regular style.
mpl.rcParams['mathtext.fontset'] = 'dejavusans'
mpl.rcParams['mathtext.default'] = 'regular'

# Re-run the font manager's initialiser so it rescans available fonts.
# NOTE(review): this is a private-style reset and can be slow at import time;
# confirm it is still needed.
try:
    fm.fontManager.__init__()
except Exception:
    # Deliberate best effort: a failed rescan must not break importing the
    # package. Only Exception is caught so Ctrl-C / SystemExit still propagate.
    pass
|
|
38
|
+
|
|
12
39
|
|
|
13
40
|
# Local modules
|
|
14
41
|
from .utils import dload
|
|
15
42
|
from .logging_config import log
|
|
16
43
|
|
|
17
|
-
# Configuration
|
|
18
|
-
set_matplotlib_formats('svg', 'pdf')
|
|
19
44
|
|
|
20
45
|
|
|
21
46
|
|
|
22
47
|
|
|
23
|
-
def plot_precision_recall_curve():
|
|
48
|
+
def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
    """Plot precision against the number of true positives on a log x-axis.

    Reads the stored "pra" results and "config" through ``dload``. The "pra"
    object is either a dict mapping dataset name to a table, or a single
    table; each table must expose ``tp`` and ``precision`` columns
    (presumably pandas DataFrames - confirm against the producer). Rows with
    ``tp <= 10`` are dropped before plotting.

    Args:
        line_width (float): Line width of every curve.
        hide_minor_ticks (bool): If true, minor ticks and their labels are
            removed from the logarithmic x-axis.

    The figure is written to ``<output_folder>/precision_recall_curve.<type>``
    when ``config["plotting"]["save_plot"]`` is true, shown unless
    ``show_plot`` is false, and always closed afterwards.
    """
    pra = dload("pra")
    config = dload("config")
    plot_config = config["plotting"]

    fig, ax = plt.subplots()
    # try/finally so the figure is released from pyplot's registry even when
    # plotting or saving raises (e.g. an unsupported output type).
    try:
        ax.set_xscale("log")

        if hide_minor_ticks:
            ax.xaxis.set_minor_locator(NullLocator())
            ax.xaxis.set_minor_formatter(NullFormatter())

        if isinstance(pra, dict):
            # Spread the datasets evenly over the default colormap.
            cmap = get_cmap()
            num_colors = len(pra)
            colors = [cmap(float(i) / max(num_colors - 1, 1)) for i in range(num_colors)]
            for (key, val), color in zip(pra.items(), colors):
                val = val[val.tp > 10]
                ax.plot(val.tp, val.precision, c=color, label=key, linewidth=line_width, alpha=0.9)
        else:
            pra = pra[pra.tp > 10]
            ax.plot(pra.tp, pra.precision, c="black", label="Precision Recall Curve", linewidth=line_width, alpha=0.9)

        ax.set(title="Precision-Recall Performance of Datasets",
               xlabel="Number of True Positives (TP)",
               ylabel="Precision")
        ax.legend(loc="upper right", frameon=False)
        ax.set_ylim(0, 1)

        # No grid, open top/right spines
        ax.grid(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if plot_config["save_plot"]:
            output_type = plot_config["output_type"]
            output_path = Path(config["output_folder"]) / f"precision_recall_curve.{output_type}"
            fig.savefig(output_path, bbox_inches="tight", format=output_type)

        if plot_config.get("show_plot", True):
            plt.show()
    finally:
        plt.close(fig)
|
|
68
92
|
|
|
69
93
|
|
|
70
94
|
|
|
71
|
-
def plot_percomplex_scatter(n_top=10):
|
|
95
|
+
def plot_percomplex_scatter(n_top=10, sig_color='#B71A2A', nonsig_color='#DBDDDD', label_color='black', border_color='black', border_width=1.0):
    """Scatter per-complex PR-AUC scores of every pair of datasets.

    Reads "config" and "pra_percomplex" through ``dload``. For each pair of
    datasets one figure is drawn: the union of the ``n_top`` highest-scoring
    complexes of either dataset is highlighted and labelled, all remaining
    complexes form the background. Nothing is drawn (a message is printed)
    when fewer than two datasets are available.

    Args:
        n_top (int): Number of top complexes taken from each dataset.
        sig_color: Fill colour of highlighted points.
        nonsig_color: Fill colour of background points.
        label_color: Colour of labels and their connector lines.
        border_color: Edge colour of all points.
        border_width (float): Edge line width of all points.

    Each figure is saved as
    ``percomplex_scatter_<a>_vs_<b>.<type>`` in the configured output folder
    when ``save_plot`` is true, shown unless ``show_plot`` is false, and
    always closed afterwards.
    """
    config = dload("config")
    plot_config = config["plotting"]
    rdict = dload("pra_percomplex")

    if len(rdict) < 2:
        print("Skipping plot: At least two datasets are required for per-complex scatter plot.")
        return

    column_pairs = list(combinations(rdict.keys(), 2))
    df = pd.DataFrame()

    # Build one table: metadata columns from the first dataset plus one
    # score column per dataset (named after the dataset).
    for i, (key, val) in enumerate(rdict.items()):
        val = val.rename(columns={"auc_score": key})
        if i == 0:
            df = val.copy().drop(columns=["Genes", "Length", "used_genes"], errors="ignore")
        else:
            df = pd.concat([df, val[key]], axis=1)

    for pair in column_pairs:
        extreme_indices_0 = df[pair[0]].sort_values(ascending=False).head(n_top).index
        extreme_indices_1 = df[pair[1]].sort_values(ascending=False).head(n_top).index
        significant_indices = extreme_indices_0.union(extreme_indices_1)

        bg_df = df.drop(index=significant_indices)
        sig_df = df.loc[significant_indices]

        fig, ax = plt.subplots()
        # try/finally so a failure in one pair does not leave its figure open.
        try:
            # Background cloud; marker area scales with n_used_genes when present.
            bg_sizes = (bg_df['n_used_genes'] if 'n_used_genes' in bg_df else pd.Series(1, index=bg_df.index)) * 5
            ax.scatter(
                bg_df[pair[0]], bg_df[pair[1]],
                facecolors=nonsig_color, edgecolors=border_color,
                s=bg_sizes, linewidth=border_width, alpha=1.0,
                zorder=0
            )

            # Highlighted points, drawn slightly larger and on top.
            sig_sizes = (sig_df['n_used_genes'] if 'n_used_genes' in sig_df else pd.Series(1, index=sig_df.index)) * 8
            ax.scatter(
                sig_df[pair[0]], sig_df[pair[1]],
                facecolors=sig_color, edgecolors=border_color,
                s=sig_sizes, linewidth=border_width, zorder=2
            )

            # Label candidates, highest y first (ties: highest x first).
            coords = sorted(
                [(sig_df.loc[idx, pair[0]], sig_df.loc[idx, pair[1]], idx) for idx in sig_df.index],
                key=lambda c: (-c[1], -c[0])
            )

            # Both axes are PR-AUC scores, i.e. a 0-1 coordinate system.
            max_y = 1.0
            scale_factor = 1.0
            min_distance = 0.05

            adjusted_coords = adjust_text_positions(
                coords, sig_sizes,
                min_distance=min_distance,
                max_y=max_y,
                scale_factor=scale_factor
            )

            for x, adj_y, idx in adjusted_coords:
                y = df.loc[idx, pair[1]]
                # Thin connector from the point to its label.
                ax.plot([x, x], [y, adj_y], color=label_color, linewidth=0.6, alpha=0.3, zorder=3)
                # Mark the label with '..' only when the name was actually cut.
                name = df.loc[idx, 'Name']
                label = name[:15] + '..' if len(name) > 15 else name
                ax.text(
                    x, adj_y + 0.005,
                    label,
                    fontsize=4, ha='left', va='bottom', color=label_color,
                    linespacing=1.5, zorder=4,
                )

            # Diagonal reference and axes cosmetics
            ax.plot([0, 1], [0, 1], linestyle='-', color='lightgray', alpha=0.4, linewidth=0.5, zorder=1)
            padding = 0.02  # keeps points at 0 or 1 off the frame
            ax.set_xlim(-padding, 1 + padding)
            ax.set_ylim(-padding, 1 + padding)
            ax.set_xlabel(f"{pair[0]} PR-AUC score")
            ax.set_ylabel(f"{pair[1]} PR-AUC score")
            ax.set_title(f"{pair[0]} vs {pair[1]} - Comparison of complex performance")

            # No grid, open top/right spines
            ax.grid(False)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

            plt.tight_layout()

            if plot_config["save_plot"]:
                output_type = plot_config["output_type"]
                output_path = Path(config["output_folder"]) / f"percomplex_scatter_{pair[0]}_vs_{pair[1]}.{output_type}"
                fig.savefig(output_path, bbox_inches="tight", format=output_type)

            if plot_config.get("show_plot", True):
                plt.show()
        finally:
            plt.close(fig)
|
|
174
195
|
|
|
175
196
|
|
|
176
197
|
|
|
198
|
+
def smart_direction_assignment(point_y, y_max, min_safe_distance=20.0):
    """Choose a label direction from the point's position within [0, y_max].

    Returns "up_only" for points in the lower third, "prefer_down" for points
    in the upper third, and "both_directions" otherwise (including points
    exactly on either boundary). ``min_safe_distance`` is accepted but not
    used by this function.
    """
    if point_y < y_max / 3:
        return "up_only"
    if point_y > 2 * y_max / 3:
        return "prefer_down"
    return "both_directions"
|
|
177
209
|
|
|
178
210
|
|
|
211
|
+
def group_points_by_y_proximity(coords, y_tolerance=5.0):
    """Group points whose Y value lies within ``y_tolerance`` of a seed point.

    Args:
        coords: Iterable of points indexable as ``point[1] == y`` (the callers
            in this module pass ``(x, y, idx)`` tuples). The input is not
            modified.
        y_tolerance (float): Maximum absolute Y difference to the seed.

    Returns:
        list[list]: Groups in order of their seed. The first ungrouped point
        seeds each group, and membership is measured against that seed only
        (groups are not chained), so input order affects the result. Points
        keep their relative input order inside a group.
    """
    # Work on a private list so any iterable is accepted and the caller's
    # sequence is left untouched.
    remaining = list(coords)
    groups = []

    while remaining:
        seed_point = remaining[0]
        seed_y = seed_point[1]
        current_group = [seed_point]
        leftover = []

        # Single pass partition instead of repeated list.pop(i).
        for point in remaining[1:]:
            if abs(point[1] - seed_y) <= y_tolerance:
                current_group.append(point)
            else:
                leftover.append(point)

        groups.append(current_group)
        remaining = leftover

    return groups
|
|
179
233
|
|
|
180
234
|
|
|
181
235
|
def adjust_text_positions(coords, sizes, min_distance=0.08, max_y=1.0, scale_factor=1.0):
    """Compute label Y positions that spread out labels of nearby points.

    Points are grouped by Y proximity. A lone point gets a single offset;
    points in a group are stacked upward and/or downward depending on where
    the group sits within the Y range.

    Args:
        coords: List of ``(x, y, idx)`` tuples.
        sizes: Marker sizes looked up with ``sizes.loc[idx]`` (a pandas
            Series in this module's callers); only used for lone points on
            normalized plots.
        min_distance (float): Lower bound for the spacing between stacked labels.
        max_y (float): Upper end of the Y range. Values above 10 select the
            "large Y" parameter set, otherwise the normalized (0-1) set.
        scale_factor (float): Multiplier for the normalized parameter set.

    Returns:
        list: ``(x, adjusted_y, idx)`` tuples, group by group.
    """
    adjusted = []

    # All thresholds are expressed in data coordinates.
    if max_y > 10:  # large Y values (e.g. gene counts)
        text_height = max_y * 0.02
        min_safe_distance = max_y * 0.05
        y_tolerance = max_y * 0.02
    else:  # normalized plots (Y in 0-1)
        text_height = 0.04 * scale_factor
        # Same proportions as the large-Y branch. The previous values (20 and
        # 5) exceeded the whole 0-1 range, which put every point into one
        # group and made the clamp below always fall through to the fallback.
        min_safe_distance = 0.05 * scale_factor
        y_tolerance = 0.02 * scale_factor

    def _bounded(candidate, fallback):
        # Keep the label between the bottom margin and the top of the range;
        # if that is impossible, use the plain upward offset.
        value = max(min_safe_distance, min(candidate, max_y - text_height))
        if value < 0 or value > max_y * 1.2:
            return fallback
        return value

    groups = group_points_by_y_proximity(coords, y_tolerance)

    for group in groups:
        group_size = len(group)
        density_multiplier = calculate_density_multiplier(group_size)

        if group_size == 1:
            x, y, idx = group[0]
            direction = smart_direction_assignment(y, max_y, min_safe_distance)

            if max_y > 10:
                base_offset = max(3, max_y * 0.03)
            else:
                # Larger markers push their label further away.
                base_offset = np.sqrt(sizes.loc[idx]) * 0.04 * scale_factor if idx in sizes else 0.04 * scale_factor

            # Only "prefer_down" with enough room below places the label
            # under the point; every other case goes up.
            if direction == "prefer_down" and y - base_offset > min_safe_distance:
                adj_y = y - base_offset
            else:
                adj_y = y + base_offset

            adjusted.append((x, _bounded(adj_y, y + base_offset), idx))
            continue

        # Several points at a similar height: distribute left to right.
        group.sort(key=lambda p: p[0])
        group_y = group[0][1]  # representative Y of the group
        direction = smart_direction_assignment(group_y, max_y, min_safe_distance)

        adaptive_spacing = calculate_adaptive_spacing(
            group_size, min_distance, text_height, max_y, density_multiplier
        )
        adaptive_base_offset = calculate_adaptive_base_offset(
            group_size, max_y, scale_factor, density_multiplier
        )

        for i, (x, y, idx) in enumerate(group):
            if direction == "up_only":
                # Stack every label upward.
                adj_y = y + adaptive_base_offset + (i * adaptive_spacing)

            elif direction == "prefer_down":
                down = y - adaptive_base_offset - (i // 2 * adaptive_spacing)
                if i % 2 == 0 and down > min_safe_distance:
                    adj_y = down
                else:
                    # Odd positions, or no room below: go up instead.
                    up_level = (i // 2) if i % 2 == 0 else ((i + 1) // 2)
                    adj_y = y + adaptive_base_offset + (up_level * adaptive_spacing)

            else:  # both_directions: even positions up, odd positions down
                up = y + adaptive_base_offset + (i // 2 * adaptive_spacing)
                if i % 2 == 0:
                    adj_y = up
                else:
                    potential_down = y - adaptive_base_offset - ((i + 1) // 2 * adaptive_spacing)
                    adj_y = potential_down if potential_down > min_safe_distance else up

            adjusted.append((x, _bounded(adj_y, y + adaptive_base_offset), idx))

    return adjusted
|
|
216
340
|
|
|
217
341
|
|
|
342
|
+
def calculate_density_multiplier(group_size):
    """Return the spacing multiplier for a cluster of ``group_size`` labels.

    The multiplier steps up with cluster size (1.0 up to 3 points, 2.5 up to
    20) and grows by 0.1 per additional point beyond 20.
    """
    # (largest group size covered, multiplier), checked in ascending order
    steps = ((3, 1.0), (6, 1.3), (10, 1.6), (15, 2.0), (20, 2.5))
    for upper_bound, multiplier in steps:
        if group_size <= upper_bound:
            return multiplier
    # More than 20 points: keep widening progressively.
    return 3.0 + (group_size - 20) * 0.1
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def calculate_adaptive_spacing(group_size, min_distance, text_height, max_y, density_multiplier):
    """Return the vertical distance between stacked labels of one cluster.

    The base spacing is the larger of ``min_distance`` and 1.5 text heights,
    scaled by ``density_multiplier``. For ``max_y > 10`` it is additionally
    scaled by ``max_y / 100`` and floored at 3 % of ``max_y``; otherwise it is
    floored at 0.05. ``group_size`` is accepted but not used here.
    """
    base_spacing = max(min_distance, text_height * 1.5)

    if max_y > 10:
        # Large Y range: scale with the range, never below 3 % of it.
        scaled = base_spacing * density_multiplier * (max_y / 100.0)
        return max(scaled, max_y * 0.03)

    # Normalized range: never below 0.05.
    return max(base_spacing * density_multiplier, 0.05)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def calculate_adaptive_base_offset(group_size, max_y, scale_factor, density_multiplier):
    """Return the base offset (connector line height) for a label cluster.

    * ``max_y > 10`` (gene count plots): start from the larger of 3 and
      3% of ``max_y``, multiply by ``density_multiplier``, and cap the
      result at 15% of ``max_y``. ``scale_factor`` is not used here.
    * otherwise (normalized plots): start from ``0.04 * scale_factor``,
      multiply by ``density_multiplier``, and cap the result at 0.2.

    ``group_size`` is accepted for signature compatibility but is not read
    here; density enters only through ``density_multiplier``.
    """
    if max_y > 10:
        # Gene count plots: offset and cap both follow the axis range.
        starting_offset, cap = max(3, max_y * 0.03), max_y * 0.15
    else:
        # Normalized plots: offset follows scale_factor, cap is fixed.
        starting_offset, cap = 0.04 * scale_factor, 0.2

    # Denser clusters get taller connector lines, up to the cap.
    return min(starting_offset * density_multiplier, cap)
|
|
391
|
+
|
|
218
392
|
|
|
219
|
-
def plot_percomplex_scatter_bysize():
|
|
393
|
+
def plot_percomplex_scatter_bysize(n_labels=10, n_top=10, sig_color='#B71A2A', nonsig_color='#DBDDDD', label_color='black', border_color='black', border_width=1.0):
|
|
220
394
|
config = dload("config")
|
|
221
395
|
plot_config = config["plotting"]
|
|
222
396
|
rdict = dload("pra_percomplex")
|
|
223
|
-
|
|
397
|
+
|
|
224
398
|
for key, per_complex in rdict.items():
|
|
225
399
|
sorted_pc = per_complex.sort_values(by="auc_score", ascending=False, na_position="last")
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
# Create figure using rcParams defaults
|
|
400
|
+
top_labels, rest = sorted_pc.head(n_labels), sorted_pc.iloc[n_labels:]
|
|
401
|
+
|
|
229
402
|
fig, ax = plt.subplots()
|
|
230
|
-
|
|
231
|
-
#
|
|
403
|
+
|
|
404
|
+
# Background (REST): filled dots with black borders, not rasterized
|
|
232
405
|
ax.scatter(
|
|
233
|
-
rest.auc_score, rest.n_used_genes,
|
|
234
|
-
edgecolors=
|
|
235
|
-
linewidth=
|
|
236
|
-
|
|
237
|
-
|
|
406
|
+
rest.auc_score, rest.n_used_genes,
|
|
407
|
+
facecolors=nonsig_color, edgecolors=border_color,
|
|
408
|
+
linewidth=border_width, s=rest.n_used_genes * 10,
|
|
409
|
+
alpha=1.0, label="Other Complexes",
|
|
410
|
+
zorder=0
|
|
238
411
|
)
|
|
239
|
-
|
|
240
|
-
# Top
|
|
412
|
+
|
|
413
|
+
# Top N: filled dots with black borders
|
|
241
414
|
ax.scatter(
|
|
242
|
-
|
|
243
|
-
facecolors=
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
s=top_10.n_used_genes * 10,
|
|
247
|
-
label="Top 10 AUC Scores"
|
|
415
|
+
top_labels.auc_score, top_labels.n_used_genes,
|
|
416
|
+
facecolors=sig_color, edgecolors=border_color,
|
|
417
|
+
linewidth=border_width, s=top_labels.n_used_genes * 10,
|
|
418
|
+
label=f"Top {n_labels} AUC Scores", alpha=1.0, zorder=2
|
|
248
419
|
)
|
|
249
420
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
421
|
+
# Labels with corrected scaling
|
|
422
|
+
coords = [(row.auc_score, row.n_used_genes, idx) for idx, row in top_labels.iterrows()]
|
|
423
|
+
sizes = top_labels.n_used_genes * 10
|
|
424
|
+
max_y = sorted_pc.n_used_genes.max() + 50
|
|
254
425
|
|
|
255
|
-
#
|
|
256
|
-
|
|
257
|
-
scale_factor = max_y / 1.0 # Scale offsets for visibility on new range
|
|
258
|
-
adjusted_coords = adjust_text_positions(
|
|
259
|
-
coords, sizes,
|
|
260
|
-
min_distance=0.08 * scale_factor,
|
|
261
|
-
max_y=max_y,
|
|
262
|
-
scale_factor=scale_factor # New param to make lengths visible
|
|
263
|
-
)
|
|
426
|
+
# Fix scaling issue - use reasonable scale factor
|
|
427
|
+
scale_factor = min(max_y / 100.0, 3.0) # Cap scale factor to prevent extreme positioning
|
|
264
428
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
if x < 0.5
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
bbox=dict(facecolor="white", alpha=0.8, edgecolor="none", pad=1.5))
|
|
289
|
-
|
|
290
|
-
# Axis configuration (integer ticks now on Y)
|
|
291
|
-
ax.yaxis.get_major_locator().set_params(integer=True)
|
|
292
|
-
ax.set_xlabel("PR-AUC score") # Swapped label
|
|
293
|
-
ax.set_ylabel("Number of genes in the complex") # Swapped label
|
|
429
|
+
adjusted = adjust_text_positions(
|
|
430
|
+
coords, sizes,
|
|
431
|
+
min_distance=max(5.0, max_y * 0.02), # Use reasonable spacing relative to Y range
|
|
432
|
+
max_y=max_y,
|
|
433
|
+
scale_factor=scale_factor
|
|
434
|
+
)
|
|
435
|
+
for x, adj_y, idx in adjusted:
|
|
436
|
+
y = top_labels.loc[idx, "n_used_genes"]
|
|
437
|
+
ax.plot([x, x], [y, adj_y], color=label_color, linewidth=0.5, alpha=0.3, zorder=3)
|
|
438
|
+
ha = 'left' if x < 0.5 else 'right'
|
|
439
|
+
text_x = x + 0.01 if x < 0.5 else x - 0.01
|
|
440
|
+
ax.text(
|
|
441
|
+
text_x, adj_y + (0.005 * scale_factor),
|
|
442
|
+
top_labels.loc[idx, 'Name'][:15] + '..',
|
|
443
|
+
fontsize=4, ha=ha, va='bottom', color=label_color, linespacing=1.5, zorder=4,
|
|
444
|
+
bbox=dict(facecolor="white", alpha=0.65, edgecolor="white", pad=1.5)
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# Set y-axis to show integer values only
|
|
448
|
+
from matplotlib.ticker import MaxNLocator
|
|
449
|
+
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
|
|
450
|
+
ax.set_xlabel("PR-AUC score")
|
|
451
|
+
ax.set_ylabel("Number of genes in the complex")
|
|
294
452
|
ax.set_title(f"{key} - Complex performance: PR-AUC score vs complex size")
|
|
295
|
-
ax.grid(False)
|
|
296
|
-
|
|
297
|
-
# Fixed limits (X fixed to 0-1 for AUC, Y with buffer for lines/labels)
|
|
298
|
-
ax.set_xlim(0, 1.0)
|
|
299
|
-
ax.set_ylim(0, max_y)
|
|
300
453
|
|
|
301
|
-
#
|
|
302
|
-
|
|
454
|
+
# No ruler + open spines
|
|
455
|
+
ax.grid(visible=False, which='both', axis='both')
|
|
456
|
+
ax.set_xlim(0, 1.0); ax.set_ylim(0, max_y)
|
|
457
|
+
ax.spines['top'].set_visible(False)
|
|
458
|
+
ax.spines['right'].set_visible(False)
|
|
303
459
|
|
|
460
|
+
plt.subplots_adjust(right=0.8)
|
|
304
461
|
plt.tight_layout()
|
|
305
462
|
|
|
306
|
-
# Save handling (dpi comes from rcParams)
|
|
307
463
|
if plot_config["save_plot"]:
|
|
308
464
|
output_type = plot_config["output_type"]
|
|
309
465
|
output_path = Path(config["output_folder"]) / f"percomplex_scatter_by_complexsize_{key}.{output_type}"
|
|
@@ -311,20 +467,28 @@ def plot_percomplex_scatter_bysize():
|
|
|
311
467
|
|
|
312
468
|
if plot_config.get("show_plot", True):
|
|
313
469
|
plt.show()
|
|
314
|
-
|
|
315
470
|
plt.close(fig)
|
|
316
471
|
|
|
317
472
|
|
|
318
473
|
|
|
319
|
-
def plot_complex_contributions(
|
|
474
|
+
def plot_complex_contributions(
|
|
475
|
+
min_pairs=10,
|
|
476
|
+
min_precision_cutoff=0.5,
|
|
477
|
+
num_complex_to_show=10,
|
|
478
|
+
y_lim=None,
|
|
479
|
+
fig_title=None,
|
|
480
|
+
fig_labs=['Fraction of TP', 'Precision'],
|
|
481
|
+
legend_rows=3, # <— NEW: rows for legend layout (try 3 or 4)
|
|
482
|
+
):
|
|
320
483
|
config = dload("config")
|
|
321
484
|
plot_config = config["plotting"]
|
|
322
485
|
plot_data_dict = dload("complex_contributions")
|
|
486
|
+
|
|
323
487
|
for key, plot_data in plot_data_dict.items():
|
|
324
488
|
s = plot_data.set_index('Name').sum()
|
|
325
489
|
find_last_precision = s[s > min_pairs].index[-1]
|
|
326
|
-
last_prec_value = float(find_last_precision.split('_')[1])
|
|
327
|
-
|
|
490
|
+
last_prec_value = float(find_last_precision.split('_')[1])
|
|
491
|
+
|
|
328
492
|
plot_data = plot_data.drop_duplicates(subset='Name')
|
|
329
493
|
cont_stepwise_anno = plot_data['Name']
|
|
330
494
|
cont_stepwise_mat = plot_data.drop(columns=['Name'])
|
|
@@ -340,117 +504,117 @@ def plot_complex_contributions(min_pairs=10, min_precision_cutoff=0.5, num_compl
|
|
|
340
504
|
x_df = pd.DataFrame(x, index=cont_stepwise_anno, columns=cont_stepwise_mat.columns)
|
|
341
505
|
ind_for_mean = y >= (last_prec_value - min_precision_cutoff)
|
|
342
506
|
if sum(ind_for_mean) == 0:
|
|
343
|
-
log.info("No values above 'min.precision.cutoff'")
|
|
344
|
-
return False
|
|
507
|
+
log.info("No values above 'min.precision.cutoff'"); return False
|
|
345
508
|
if sum(ind_for_mean) == 1:
|
|
346
|
-
log.info("Only one value above 'min.precision.cutoff'
|
|
347
|
-
|
|
348
|
-
# Select top complexes
|
|
509
|
+
log.info("Only one value above 'min.precision.cutoff'"); return False
|
|
510
|
+
|
|
349
511
|
a = x_df.loc[:, ind_for_mean].mean(axis=1).sort_values()[-num_complex_to_show:]
|
|
350
512
|
subset = x_df.loc[a.index, :]
|
|
351
|
-
|
|
352
|
-
cmap = plt.get_cmap()
|
|
513
|
+
|
|
514
|
+
cmap = plt.get_cmap()
|
|
353
515
|
colors = cmap(np.linspace(0, 1, num_complex_to_show))
|
|
354
|
-
colors = np.vstack(([0.5, 0.5, 0.5, 1.0], colors))
|
|
516
|
+
colors = np.vstack(([0.5, 0.5, 0.5, 1.0], colors)) # 'others' + top K
|
|
355
517
|
others = pd.DataFrame(1 - subset.sum(axis=0), columns=['others']).T
|
|
356
518
|
merged = pd.concat([others, subset], ignore_index=False)
|
|
357
|
-
|
|
358
|
-
x1 = np.zeros_like(
|
|
359
|
-
|
|
360
|
-
for i in range(x.shape[0]):
|
|
519
|
+
X = merged.to_numpy()
|
|
520
|
+
x1 = np.zeros_like(X); x2 = np.zeros_like(X)
|
|
521
|
+
for i in range(X.shape[0]):
|
|
361
522
|
if i == 0:
|
|
362
|
-
x2[i, :] =
|
|
523
|
+
x2[i, :] = X[0, :]
|
|
363
524
|
elif i == 1:
|
|
364
|
-
x1[i, :] =
|
|
525
|
+
x1[i, :] = X[0, :]
|
|
365
526
|
else:
|
|
366
|
-
x1[i, :] =
|
|
527
|
+
x1[i, :] = X[:i, :].sum(axis=0)
|
|
367
528
|
if i > 0:
|
|
368
|
-
x2[i, :] =
|
|
369
|
-
|
|
529
|
+
x2[i, :] = X[:i + 1, :].sum(axis=0)
|
|
370
530
|
|
|
371
|
-
|
|
372
|
-
padding = 0.02 # Small padding to avoid clipping (adjust as needed, e.g., 0.05 for more space)
|
|
531
|
+
padding = 0.02
|
|
373
532
|
lower = max(0, min(y) - padding)
|
|
374
|
-
upper = last_prec_value + padding
|
|
533
|
+
upper = last_prec_value + padding
|
|
375
534
|
y_lim = (lower, upper)
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [5, 1]})
|
|
535
|
+
|
|
536
|
+
# Give legend a bit more room
|
|
537
|
+
fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [5, 1.8]})
|
|
379
538
|
ax[0].set_xlim(0, 1)
|
|
380
|
-
ax[0].set_ylim(*y_lim)
|
|
539
|
+
ax[0].set_ylim(*y_lim)
|
|
381
540
|
ax[0].set_xlabel(fig_labs[0])
|
|
382
541
|
ax[0].set_ylabel(fig_labs[1])
|
|
383
542
|
ax[0].set_title(fig_title if fig_title else f"{key} - Contribution of complexes")
|
|
384
|
-
for i in range(
|
|
543
|
+
for i in range(X.shape[0]):
|
|
385
544
|
ax[0].fill_betweenx(y, x1[i, :], x2[i, :], color=colors[i], edgecolor='white')
|
|
386
545
|
|
|
387
|
-
# Legend
|
|
388
|
-
|
|
389
|
-
|
|
546
|
+
# Legend: multi-row, constrained to width
|
|
547
|
+
def _short(s, n=14): return (s[:n-1] + '…') if len(s) > n else s
|
|
548
|
+
labels = [_short(lbl) for lbl in merged.index]
|
|
549
|
+
handles = [patches.Patch(color=colors[i], label=labels[i]) for i in range(len(labels))]
|
|
390
550
|
ax[1].axis('off')
|
|
391
|
-
|
|
551
|
+
n_items = len(handles)
|
|
552
|
+
ncols = int(np.ceil(n_items / max(1, legend_rows))) # spread across rows
|
|
553
|
+
ax[1].legend(
|
|
554
|
+
handles=handles,
|
|
555
|
+
loc='center',
|
|
556
|
+
ncol=ncols,
|
|
557
|
+
frameon=False,
|
|
558
|
+
title="Complexes",
|
|
559
|
+
fontsize=6, title_fontsize=6,
|
|
560
|
+
handlelength=0.9, handletextpad=0.25,
|
|
561
|
+
borderaxespad=0.0,
|
|
562
|
+
labelspacing=0.25, columnspacing=0.6,
|
|
563
|
+
mode='expand'
|
|
564
|
+
)
|
|
565
|
+
|
|
392
566
|
plt.tight_layout()
|
|
393
567
|
|
|
394
|
-
# Save handling (remove explicit dpi)
|
|
395
568
|
if plot_config["save_plot"]:
|
|
396
|
-
output_type
|
|
397
|
-
output_folder
|
|
398
|
-
output_path
|
|
569
|
+
output_type = plot_config["output_type"]
|
|
570
|
+
output_folder= Path(config["output_folder"])
|
|
571
|
+
output_path = output_folder / f"complex_contributions_{key}.{output_type}"
|
|
399
572
|
fig.savefig(output_path, bbox_inches="tight", format=output_type)
|
|
400
573
|
|
|
401
574
|
if plot_config.get("show_plot", True):
|
|
402
575
|
plt.show()
|
|
403
|
-
|
|
404
576
|
plt.close(fig)
|
|
405
577
|
|
|
406
578
|
|
|
579
|
+
|
|
407
580
|
def plot_significant_complexes():
|
|
408
581
|
config = dload("config")
|
|
409
582
|
plot_config = config["plotting"]
|
|
410
583
|
pra_percomplex = dload("pra_percomplex")
|
|
411
584
|
|
|
412
|
-
# Define thresholds and prepare data
|
|
413
585
|
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
|
|
414
586
|
datasets = list(pra_percomplex.keys())
|
|
415
587
|
num_datasets = len(datasets)
|
|
416
588
|
|
|
417
|
-
# Create a DataFrame to store results
|
|
418
589
|
df = pd.DataFrame(index=thresholds)
|
|
419
590
|
for key, complex_data in pra_percomplex.items():
|
|
420
591
|
df[key] = [complex_data.query(f'auc_score >= {t}').shape[0] for t in thresholds]
|
|
421
592
|
|
|
422
|
-
# Create figure
|
|
423
593
|
fig, ax = plt.subplots()
|
|
424
594
|
|
|
425
|
-
# Use colormap from rcParams
|
|
426
595
|
cmap = plt.get_cmap()
|
|
427
596
|
colors = [cmap(i / (num_datasets + 1)) for i in range(1, num_datasets + 1)]
|
|
428
597
|
|
|
429
|
-
|
|
430
|
-
bar_width = 0.8 / num_datasets # Dynamic width based on dataset count
|
|
598
|
+
bar_width = 0.8 / num_datasets
|
|
431
599
|
for i, dataset in enumerate(datasets):
|
|
432
600
|
x = np.arange(len(thresholds)) + i * bar_width
|
|
433
601
|
ax.bar(x, df[dataset], width=bar_width, color=colors[i], edgecolor='black', label=dataset)
|
|
434
602
|
|
|
435
|
-
# Customize x-axis labels
|
|
436
603
|
ax.set_xticks(np.arange(len(thresholds)) + (num_datasets - 1) * bar_width / 2)
|
|
437
|
-
ax.set_xticklabels(thresholds, rotation=0, ha='center')
|
|
604
|
+
ax.set_xticklabels([str(t) for t in thresholds], rotation=0, ha='center')
|
|
438
605
|
|
|
439
|
-
# Set title and axis labels (handled by rcParams)
|
|
440
606
|
ax.set_title("Number of significant complexes above PR-AUC thresholds")
|
|
441
607
|
ax.set_xlabel("PR-AUC score thresholds")
|
|
442
608
|
ax.set_ylabel("Number of complexes")
|
|
443
609
|
|
|
444
|
-
#
|
|
445
|
-
ax.grid(
|
|
610
|
+
# Nature style: no grid; open top/right spines
|
|
611
|
+
ax.grid(False)
|
|
612
|
+
for spine in ('right', 'top'):
|
|
613
|
+
ax.spines[spine].set_visible(False)
|
|
446
614
|
|
|
447
|
-
|
|
448
|
-
ax.legend(loc='upper right')
|
|
449
|
-
|
|
450
|
-
# Adjust layout
|
|
615
|
+
ax.legend(loc='upper right', frameon=False)
|
|
451
616
|
plt.tight_layout()
|
|
452
617
|
|
|
453
|
-
# Save figure if required
|
|
454
618
|
if plot_config["save_plot"]:
|
|
455
619
|
output_type = plot_config["output_type"]
|
|
456
620
|
output_folder = Path(config["output_folder"])
|
|
@@ -459,7 +623,7 @@ def plot_significant_complexes():
|
|
|
459
623
|
|
|
460
624
|
if plot_config.get("show_plot", True):
|
|
461
625
|
plt.show()
|
|
462
|
-
|
|
626
|
+
|
|
463
627
|
plt.close(fig)
|
|
464
628
|
return df
|
|
465
629
|
|
|
@@ -470,34 +634,30 @@ def plot_auc_scores():
|
|
|
470
634
|
plot_config = config["plotting"]
|
|
471
635
|
pra_dict = dload("pr_auc")
|
|
472
636
|
|
|
473
|
-
|
|
474
637
|
sorted_items = sorted(pra_dict.items(), key=lambda x: x[1], reverse=True)
|
|
475
638
|
datasets = [k for k, _ in sorted_items]
|
|
476
639
|
auc_scores = [v for _, v in sorted_items]
|
|
477
640
|
|
|
478
|
-
# Create figure and axis
|
|
479
641
|
fig, ax = plt.subplots()
|
|
480
642
|
|
|
481
|
-
# Use colormap from rcParams
|
|
482
643
|
cmap = plt.get_cmap()
|
|
483
644
|
num_datasets = len(datasets)
|
|
484
645
|
colors = [cmap(i / (num_datasets + 1)) for i in range(1, num_datasets + 1)]
|
|
485
646
|
|
|
486
|
-
# Plot bars
|
|
487
647
|
ax.bar(datasets, auc_scores, color=colors, edgecolor="black")
|
|
488
648
|
|
|
489
|
-
# Set y-axis limits dynamically
|
|
490
649
|
ax.set_ylim(0, max(auc_scores) + 0.01)
|
|
491
|
-
|
|
492
|
-
# Set title and labels
|
|
493
650
|
ax.set_title("AUC scores for the datasets")
|
|
494
651
|
ax.set_ylabel("AUC score")
|
|
495
652
|
plt.xticks(rotation=45, ha="right")
|
|
496
653
|
|
|
497
|
-
#
|
|
498
|
-
ax.grid(axis='
|
|
654
|
+
# Hard-disable any grid/ruler
|
|
655
|
+
ax.grid(visible=False, which='both', axis='both')
|
|
656
|
+
ax.set_axisbelow(False) # make sure nothing faint is drawn beneath
|
|
657
|
+
# Open spines
|
|
658
|
+
ax.spines['top'].set_visible(False)
|
|
659
|
+
ax.spines['right'].set_visible(False)
|
|
499
660
|
|
|
500
|
-
# Save the figure if required
|
|
501
661
|
if plot_config["save_plot"]:
|
|
502
662
|
output_type = plot_config["output_type"]
|
|
503
663
|
output_folder = Path(config["output_folder"])
|
|
@@ -507,6 +667,6 @@ def plot_auc_scores():
|
|
|
507
667
|
|
|
508
668
|
if plot_config.get("show_plot", True):
|
|
509
669
|
plt.show()
|
|
510
|
-
|
|
670
|
+
|
|
511
671
|
plt.close(fig)
|
|
512
672
|
return pra_dict
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pythonflex
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2
|
|
4
4
|
Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
|
|
5
5
|
Author-email: Yasir Demirtaş <tyasird@hotmail.com>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
pythonflex/__init__.py,sha256=
|
|
2
|
-
pythonflex/analysis.py,sha256
|
|
1
|
+
pythonflex/__init__.py,sha256=rW_MdM0ijaN9HAzFL-P7oVfLRJRGq0M83Izfmdr_s64,1346
|
|
2
|
+
pythonflex/analysis.py,sha256=cGUZKb5swJ7h1f6PJcIg4AW-BulY2ETGjJul9cg3u-k,57217
|
|
3
3
|
pythonflex/logging_config.py,sha256=iqRKK18zvtfV_-bYHWrXtSZywiUtYxoHkw0ZnVORQBQ,2015
|
|
4
|
-
pythonflex/plotting.py,sha256=
|
|
4
|
+
pythonflex/plotting.py,sha256=ywUa95UxUaxtkaWGffrCcaQWw1WCF17GF6nF8KgWhc0,27429
|
|
5
5
|
pythonflex/preprocessing.py,sha256=oIGPdmETuBQh4mdsIOWB5DOmYndp9S-sW8r7h_ek0Zo,8583
|
|
6
6
|
pythonflex/utils.py,sha256=nyVlGu5OXpz5YPj48hXueL5ja88sQ2PUiJ76c4USg4A,3886
|
|
7
7
|
pythonflex/data/dataset/liver_cell_lines_500_genes.csv,sha256=qfKsqPjL41Y1GuxxAhc-MfaNO0mX6Qju_SeynKSpEiM,238639
|
|
@@ -13,9 +13,10 @@ pythonflex/data/gold_standard/PATHWAY.parquet,sha256=bFRDe3PQ_TFc7B1uZuynwOGcgxE
|
|
|
13
13
|
pythonflex/data/gold_standard/corum.csv,sha256=2rZeyr2Ghm7f-gFxCZnhPtxI2jxRoiZMUEH2EJwAgsI,208889
|
|
14
14
|
pythonflex/data/gold_standard/gobp.csv,sha256=TO9yfx9mO8WkXvWfSB-pFId9T8xYfqdZpshAXC0Fyj8,1739167
|
|
15
15
|
pythonflex/data/gold_standard/pathway.csv,sha256=J3HKVLUZ_Oxucmn_14ieYp3Wr2lcKtp0nIl4_8_K2Yc,489424
|
|
16
|
-
pythonflex/examples/basic_usage.py,sha256=
|
|
16
|
+
pythonflex/examples/basic_usage.py,sha256=4Kv3OdiyBruq30Ppwx2xYx1ioEtl8jeAg6mAJxzA6Go,1919
|
|
17
17
|
pythonflex/examples/dataset_filtering.py,sha256=56ZXgsbUNaHoGjX8QdQZ74CjUXDi-qdzfeMhmP1WHAA,978
|
|
18
|
-
pythonflex
|
|
19
|
-
pythonflex-0.
|
|
20
|
-
pythonflex-0.
|
|
21
|
-
pythonflex-0.
|
|
18
|
+
pythonflex/examples/test.py,sha256=B8-JE5AU7be5loSr6Qv2rOviXXe1NRCYpaEGfGjaow0,2388
|
|
19
|
+
pythonflex-0.2.dist-info/METADATA,sha256=8URwIkDildA8Hh-WvsGwIywP2ssMCrp1-Z2zIdYckRM,3926
|
|
20
|
+
pythonflex-0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
21
|
+
pythonflex-0.2.dist-info/entry_points.txt,sha256=37liK1baI_CRVDivpjsn8JDClL9_YeTTuSMAZ3Ty7oE,47
|
|
22
|
+
pythonflex-0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|