pythonflex 0.1.5__tar.gz → 0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex-0.2/.vscode/settings.json +5 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/PKG-INFO +1 -1
- {pythonflex-0.1.5 → pythonflex-0.2}/pyproject.toml +1 -1
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/__init__.py +2 -2
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/analysis.py +182 -85
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/examples/basic_usage.py +15 -35
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/examples/dataset_filtering.py +2 -0
- pythonflex-0.2/src/pythonflex/examples/test.py +104 -0
- pythonflex-0.2/src/pythonflex/plotting.py +672 -0
- pythonflex-0.1.5/src/pythonflex/plotting.py +0 -510
- {pythonflex-0.1.5 → pythonflex-0.2}/.gitignore +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/.python-version +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/README.md +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/dataset/liver_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/CORUM.parquet +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/GOBP.parquet +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/PATHWAY.parquet +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/corum.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/gobp.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/data/gold_standard/pathway.csv +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/logging_config.py +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/preprocessing.py +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/src/pythonflex/utils.py +0 -0
- {pythonflex-0.1.5 → pythonflex-0.2}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pythonflex
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.2
|
|
4
4
|
Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
|
|
5
5
|
Author-email: Yasir Demirtaş <tyasird@hotmail.com>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "pythonflex"
|
|
3
|
-
version = "0.1.5"
|
|
3
|
+
version = "0.2"
|
|
4
4
|
description = "pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
from .logging_config import log
|
|
3
3
|
from .utils import dsave, dload
|
|
4
4
|
from .preprocessing import get_example_data_path, load_datasets, get_common_genes, filter_matrix_by_genes, load_gold_standard, filter_duplicate_terms
|
|
5
|
-
from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv
|
|
5
|
+
from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv, update_matploblib_config
|
|
6
6
|
from .plotting import (
|
|
7
7
|
adjust_text_positions, plot_precision_recall_curve, plot_percomplex_scatter,
|
|
8
8
|
plot_percomplex_scatter_bysize, plot_complex_contributions, plot_significant_complexes, plot_auc_scores
|
|
@@ -14,5 +14,5 @@ __all__ = [ "log", "get_example_data_path", "fast_corr",
|
|
|
14
14
|
"perform_corr", "is_symmetric", "binary", "has_mirror_of_first_pair", "convert_full_to_half_matrix",
|
|
15
15
|
"drop_mirror_pairs", "quick_sort", "complex_contributions", "adjust_text_positions", "plot_precision_recall_curve",
|
|
16
16
|
"plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
|
|
17
|
-
"plot_significant_complexes", "plot_auc_scores", "save_results_to_csv"
|
|
17
|
+
"plot_significant_complexes", "plot_auc_scores", "save_results_to_csv", "update_matploblib_config"
|
|
18
18
|
]
|
|
@@ -23,7 +23,7 @@ from .logging_config import log
|
|
|
23
23
|
from .preprocessing import filter_matrix_by_genes
|
|
24
24
|
from .utils import dsave, dload, _sanitize
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
import matplotlib as mpl
|
|
27
27
|
|
|
28
28
|
def deep_update(source, overrides):
|
|
29
29
|
"""Recursively update the source dict with the overrides."""
|
|
@@ -40,7 +40,7 @@ def initialize(config={}):
|
|
|
40
40
|
|
|
41
41
|
default_config = {
|
|
42
42
|
"min_genes_in_complex": 3,
|
|
43
|
-
"min_genes_per_complex_analysis":
|
|
43
|
+
"min_genes_per_complex_analysis": 2,
|
|
44
44
|
"output_folder": "output",
|
|
45
45
|
"gold_standard": "CORUM",
|
|
46
46
|
"color_map": "RdYlBu",
|
|
@@ -48,7 +48,7 @@ def initialize(config={}):
|
|
|
48
48
|
"plotting": {
|
|
49
49
|
"save_plot": True,
|
|
50
50
|
"show_plot": True,
|
|
51
|
-
"output_type": "
|
|
51
|
+
"output_type": "pdf",
|
|
52
52
|
},
|
|
53
53
|
"preprocessing": {
|
|
54
54
|
"normalize": False,
|
|
@@ -95,31 +95,105 @@ def initialize(config={}):
|
|
|
95
95
|
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
'
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def update_matploblib_config(config=None, font_family="Arial", layout="single"):
|
|
101
|
+
"""
|
|
102
|
+
Configure matplotlib settings optimized for Nature journal figures:
|
|
103
|
+
- 7 pt fonts (labels, ticks, legend), 9 pt titles
|
|
104
|
+
- Thin spines (0.5 pt), ticks out (left/bottom only), no minor ticks
|
|
105
|
+
- No grid, clean minimalist look
|
|
106
|
+
- Colorblind-friendly Tableau 10 color cycle
|
|
107
|
+
- Illustrator-safe PDF export (Type 42)
|
|
108
|
+
- Figure sizes: "single" (4.0 x 3.0 in, ~102 mm wide), "double" (7.2 x 5.4 in, ~183 mm wide), or custom (width, height) in inches
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
config (dict, optional): Configuration dict (e.g., {'color_map': 'RdYlBu'}).
|
|
112
|
+
font_family (str): Preferred font (e.g., 'Arial', falls back to 'Helvetica').
|
|
113
|
+
layout (str or tuple): 'single' (4.0 x 3.0 in, ~102 mm wide), 'double' (7.2 x 5.4 in, ~183 mm wide), or (width, height) in inches.
|
|
114
|
+
"""
|
|
115
|
+
if config is None:
|
|
116
|
+
config = {}
|
|
117
|
+
# Fallback if chosen font missing
|
|
118
|
+
try:
|
|
119
|
+
from matplotlib.font_manager import findfont, FontProperties
|
|
120
|
+
findfont(FontProperties(family=font_family))
|
|
121
|
+
except Exception:
|
|
122
|
+
font_family = "Helvetica" # Nature prefers Helvetica if Arial unavailable
|
|
123
|
+
print(f"Warning: '{font_family}' not found, falling back to 'Helvetica'.")
|
|
124
|
+
|
|
125
|
+
# Figure size presets (Nature: single ≈ 89 mm, double ≈ 183 mm at 25.4 mm/inch)
|
|
126
|
+
if isinstance(layout, tuple):
|
|
127
|
+
fig_w, fig_h = layout
|
|
128
|
+
else:
|
|
129
|
+
if layout == "double":
|
|
130
|
+
fig_w = 7.2 # ~183 mm
|
|
131
|
+
fig_h = 5.4 # Adjusted aspect
|
|
132
|
+
else: # "single"
|
|
133
|
+
fig_w = 4.0 # Increased from 3.5" for more space (~102 mm)
|
|
134
|
+
fig_h = 3.0 # Increased from 2.6" for better aspect (~76 mm)
|
|
135
|
+
# Colorblind-friendly cycle (Tableau 10 adapted)
|
|
136
|
+
cb_cycle = [
|
|
137
|
+
"#4E79A7", "#F28E2B", "#E15759", "#76B7B2", "#59A14F",
|
|
138
|
+
"#EDC948", "#B07AA1", "#FF9DA7", "#9C755F", "#BAB0AC"
|
|
139
|
+
]
|
|
140
|
+
mpl.rcParams.update({
|
|
141
|
+
# --- Text & Fonts ---
|
|
142
|
+
"text.usetex": False, # Avoid LaTeX
|
|
143
|
+
"font.family": [font_family], # Explicit font
|
|
144
|
+
"mathtext.fontset": "dejavusans", # Disable mathtext
|
|
145
|
+
"mathtext.default": "regular", # Plain text
|
|
146
|
+
"axes.unicode_minus": True, # Proper minus signs
|
|
147
|
+
# --- Sizes (7 pt baseline, adjusted for space) ---
|
|
148
|
+
"font.size": 7, # Reduced from 8 pt
|
|
149
|
+
"axes.titlesize": 9, # Reduced from 10 pt
|
|
150
|
+
"axes.labelsize": 7,
|
|
151
|
+
"legend.fontsize": 7,
|
|
152
|
+
"xtick.labelsize": 7,
|
|
153
|
+
"ytick.labelsize": 7,
|
|
154
|
+
# --- Lines & Markers ---
|
|
155
|
+
"lines.linewidth": 1.5, # Kept for data visibility
|
|
156
|
+
"lines.markersize": 4.0,
|
|
157
|
+
"patch.linewidth": 0.5,
|
|
158
|
+
"errorbar.capsize": 2,
|
|
159
|
+
# --- Axes, Spines, Ticks ---
|
|
160
|
+
"axes.linewidth": 0.5,
|
|
161
|
+
"axes.edgecolor": "black",
|
|
162
|
+
"axes.facecolor": "none",
|
|
163
|
+
"axes.titlepad": 3.0,
|
|
164
|
+
"axes.labelpad": 2.0,
|
|
165
|
+
"axes.prop_cycle": mpl.cycler(color=cb_cycle),
|
|
166
|
+
"xtick.direction": "out",
|
|
167
|
+
"ytick.direction": "out",
|
|
168
|
+
"xtick.major.size": 2.5,
|
|
169
|
+
"ytick.major.size": 2.5,
|
|
170
|
+
"xtick.minor.visible": False,
|
|
171
|
+
"ytick.minor.visible": False,
|
|
172
|
+
"xtick.major.width": 0.5,
|
|
173
|
+
"ytick.major.width": 0.5,
|
|
174
|
+
"xtick.top": False,
|
|
175
|
+
"ytick.right": False,
|
|
176
|
+
# --- Grid ---
|
|
177
|
+
"axes.grid": False,
|
|
178
|
+
# --- Legend ---
|
|
179
|
+
"legend.frameon": False,
|
|
180
|
+
"legend.handlelength": 1.6, # Slightly adjusted
|
|
181
|
+
"legend.handletextpad": 0.4,
|
|
182
|
+
"legend.borderaxespad": 0.3,
|
|
183
|
+
"legend.loc": "best", # Dynamic placement to avoid overlap
|
|
184
|
+
# --- Figure & Save ---
|
|
185
|
+
"figure.dpi": 600,
|
|
186
|
+
"figure.figsize": (fig_w, fig_h),
|
|
187
|
+
"savefig.dpi": 600,
|
|
188
|
+
"savefig.bbox": "tight",
|
|
189
|
+
"savefig.pad_inches": 0.1, # Increased for spacing
|
|
190
|
+
"savefig.transparent": False, # White background
|
|
191
|
+
# --- PDF/SVG Export ---
|
|
192
|
+
"pdf.fonttype": 42,
|
|
193
|
+
"ps.fonttype": 42,
|
|
194
|
+
"pdf.use14corefonts": False,
|
|
195
|
+
"svg.fonttype": "none",
|
|
121
196
|
})
|
|
122
|
-
log.done("Matplotlib settings updated.")
|
|
123
197
|
|
|
124
198
|
|
|
125
199
|
|
|
@@ -172,15 +246,14 @@ def pra(dataset_name, matrix, is_corr=False):
|
|
|
172
246
|
pr_auc = metrics.auc(recall, precision)
|
|
173
247
|
df["precision"] = precision
|
|
174
248
|
df["recall"] = recall
|
|
175
|
-
|
|
249
|
+
|
|
176
250
|
log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
|
|
177
251
|
dsave(df, "pra", dataset_name)
|
|
178
252
|
dsave(pr_auc, "pr_auc", dataset_name)
|
|
179
|
-
|
|
180
|
-
return df, pr_auc
|
|
181
|
-
|
|
182
|
-
|
|
253
|
+
dsave( _corrected_auc(df) , "corrected_pr_auc", dataset_name)
|
|
183
254
|
|
|
255
|
+
log.done(f"Global PRA completed for {dataset_name}")
|
|
256
|
+
return df
|
|
184
257
|
|
|
185
258
|
|
|
186
259
|
|
|
@@ -189,6 +262,9 @@ def pra(dataset_name, matrix, is_corr=False):
|
|
|
189
262
|
# helper functions for PRA per-complex analysis
|
|
190
263
|
# --------------------------------------------------------------------------
|
|
191
264
|
|
|
265
|
+
def _corrected_auc(df: pd.DataFrame) -> float:
|
|
266
|
+
return np.trapz(df["precision"], df["recall"]) - df["precision"].iloc[-1]
|
|
267
|
+
|
|
192
268
|
def _build_gene_to_pair_indices(pairwise_df):
|
|
193
269
|
indices = pairwise_df.index.values
|
|
194
270
|
genes = pd.concat([pairwise_df['gene1'], pairwise_df['gene2']], ignore_index=True)
|
|
@@ -240,10 +316,15 @@ def _dump_pairwise_memmap(df: pd.DataFrame, tag: str) -> Path:
|
|
|
240
316
|
|
|
241
317
|
|
|
242
318
|
|
|
243
|
-
|
|
319
|
+
# Global variables for worker processes (compatible with older joblib)
|
|
320
|
+
PAIRWISE_DF = None
|
|
321
|
+
GENE2IDX = None
|
|
322
|
+
|
|
323
|
+
def _init_worker_globals(memmap_path, gene_to_pair_indices):
|
|
324
|
+
"""Initialize global variables for worker processes"""
|
|
244
325
|
global PAIRWISE_DF, GENE2IDX
|
|
245
326
|
PAIRWISE_DF = load(memmap_path)
|
|
246
|
-
GENE2IDX
|
|
327
|
+
GENE2IDX = gene_to_pair_indices
|
|
247
328
|
|
|
248
329
|
|
|
249
330
|
|
|
@@ -263,42 +344,52 @@ def delete_memmap(memmap_path, log, wait_seconds=0.1):
|
|
|
263
344
|
# --------------------------------------------------------------------------
|
|
264
345
|
# Process each chunk of terms
|
|
265
346
|
# --------------------------------------------------------------------------
|
|
266
|
-
def _process_chunk(chunk_terms, min_genes):
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
347
|
+
def _process_chunk(chunk_terms, min_genes, memmap_path, gene_to_pair_indices):
|
|
348
|
+
"""Process a chunk of terms - compatible with older joblib versions"""
|
|
349
|
+
try:
|
|
350
|
+
# Load data in each worker (compatible with older joblib)
|
|
351
|
+
pairwise_df = load(memmap_path)
|
|
352
|
+
local_auc_scores = {}
|
|
353
|
+
local_corrected_auc_scores = {}
|
|
354
|
+
|
|
355
|
+
for idx, row in chunk_terms.iterrows():
|
|
356
|
+
gene_set = set(row.used_genes)
|
|
357
|
+
if len(gene_set) < min_genes:
|
|
358
|
+
continue
|
|
275
359
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
360
|
+
candidate_indices = bitarray(len(pairwise_df))
|
|
361
|
+
for g in gene_set:
|
|
362
|
+
if g in gene_to_pair_indices:
|
|
363
|
+
candidate_indices[gene_to_pair_indices[g]] = True
|
|
364
|
+
if not candidate_indices.any():
|
|
365
|
+
continue
|
|
282
366
|
|
|
283
|
-
|
|
284
|
-
|
|
367
|
+
selected = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
|
|
368
|
+
sub_df = pairwise_df.iloc[selected]
|
|
285
369
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
370
|
+
complex_id = str(idx)
|
|
371
|
+
pattern = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
|
|
372
|
+
true_label = sub_df["complex_ids"].str.contains(pattern, regex=True).astype(int)
|
|
373
|
+
mask = (sub_df["complex_ids"] == "") | (true_label == 1)
|
|
374
|
+
preds = true_label[mask]
|
|
291
375
|
|
|
292
|
-
|
|
293
|
-
|
|
376
|
+
if preds.sum() == 0:
|
|
377
|
+
continue
|
|
294
378
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
379
|
+
tp_cum = preds.cumsum()
|
|
380
|
+
precision = tp_cum / (np.arange(len(preds)) + 1)
|
|
381
|
+
recall = tp_cum / tp_cum.iloc[-1]
|
|
382
|
+
if len(recall) >= 2 and recall.iloc[-1] != 0:
|
|
383
|
+
# Compute regular AUC
|
|
384
|
+
local_auc_scores[idx] = metrics.auc(recall, precision)
|
|
385
|
+
# Compute corrected AUC using the same logic as _corrected_auc function
|
|
386
|
+
local_corrected_auc_scores[idx] = np.trapz(precision, recall) - precision.iloc[-1]
|
|
300
387
|
|
|
301
|
-
|
|
388
|
+
return {'auc': local_auc_scores, 'corrected_auc': local_corrected_auc_scores}
|
|
389
|
+
|
|
390
|
+
except Exception as e:
|
|
391
|
+
# Return error info for debugging
|
|
392
|
+
return {'error': str(e), 'chunk_size': len(chunk_terms)}
|
|
302
393
|
|
|
303
394
|
|
|
304
395
|
|
|
@@ -345,26 +436,23 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
|
|
|
345
436
|
results = None
|
|
346
437
|
|
|
347
438
|
try:
|
|
348
|
-
#
|
|
439
|
+
# Compatible parallel execution for older joblib versions
|
|
349
440
|
log.started("Processing chunks in parallel")
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
)(delayed(_process_chunk)(chunk, min_genes) for chunk in chunks)
|
|
360
|
-
|
|
361
|
-
# Update progress bar once all tasks are complete
|
|
362
|
-
pbar.update(len(chunks))
|
|
441
|
+
|
|
442
|
+
# Use a more conservative approach with older joblib
|
|
443
|
+
results = Parallel(
|
|
444
|
+
n_jobs=min(4, len(chunks)), # Limit to 4 workers or number of chunks
|
|
445
|
+
temp_folder=os.path.dirname(memmap_path),
|
|
446
|
+
max_nbytes='100M', # Set memory limit
|
|
447
|
+
verbose=1 # Show progress
|
|
448
|
+
)(delayed(_process_chunk)(chunk, min_genes, memmap_path, gene_to_pair_indices)
|
|
449
|
+
for chunk in tqdm(chunks, desc="Per-complex PRA"))
|
|
363
450
|
|
|
364
451
|
log.done("Processing chunks in parallel")
|
|
365
452
|
|
|
366
453
|
except Exception as e:
|
|
367
454
|
log.error(f"Error during parallel processing: {e}")
|
|
455
|
+
log.error(f"Error type: {type(e).__name__}")
|
|
368
456
|
# Still try to clean up the memmap file
|
|
369
457
|
try:
|
|
370
458
|
if os.path.exists(memmap_path):
|
|
@@ -383,19 +471,29 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
|
|
|
383
471
|
except OSError as e:
|
|
384
472
|
log.warning(f"Failed to remove memmap file {memmap_path}: {e}")
|
|
385
473
|
|
|
386
|
-
# Merge results with error handling
|
|
474
|
+
# Merge results with enhanced error handling
|
|
387
475
|
auc_scores = {}
|
|
476
|
+
corrected_auc_scores = {}
|
|
388
477
|
if results:
|
|
389
|
-
for res in results:
|
|
478
|
+
for i, res in enumerate(results):
|
|
390
479
|
if isinstance(res, dict):
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
480
|
+
if 'error' in res:
|
|
481
|
+
log.error(f"Error in chunk {i}: {res['error']}")
|
|
482
|
+
elif 'auc' in res and 'corrected_auc' in res:
|
|
483
|
+
# New format with both AUC types
|
|
484
|
+
auc_scores.update(res['auc'])
|
|
485
|
+
corrected_auc_scores.update(res['corrected_auc'])
|
|
486
|
+
else:
|
|
487
|
+
# Fallback for old format (backward compatibility)
|
|
488
|
+
auc_scores.update(res)
|
|
489
|
+
elif isinstance(res, tuple) and len(res) >= 2 and res[0] is None:
|
|
490
|
+
log.error(f"Chunk {i} error: {res[1]}")
|
|
394
491
|
else:
|
|
395
|
-
log.
|
|
492
|
+
log.warning(f"Unexpected result type from chunk {i}: {type(res)} - {res}")
|
|
396
493
|
|
|
397
494
|
# Add the computed AUC scores to the terms DataFrame.
|
|
398
495
|
terms["auc_score"] = pd.Series(auc_scores)
|
|
496
|
+
terms["corrected_auc_score"] = pd.Series(corrected_auc_scores)
|
|
399
497
|
terms.drop(columns=["hash"], inplace=True)
|
|
400
498
|
dsave(terms, "pra_percomplex", dataset_name)
|
|
401
499
|
log.done(f"Per-complex PRA completed.")
|
|
@@ -1296,4 +1394,3 @@ def save_results_to_csv(categories = ["complex_contributions", "pr_auc", "pra_pe
|
|
|
1296
1394
|
# dsave(pr_auc, "pr_auc", dataset_name)
|
|
1297
1395
|
# log.done(f"Global PRA completed for {dataset_name}")
|
|
1298
1396
|
# return df, pr_auc
|
|
1299
|
-
|
|
@@ -6,18 +6,22 @@ Demonstrates initialization, data loading, analysis, and plotting.
|
|
|
6
6
|
import pythonflex as flex
|
|
7
7
|
|
|
8
8
|
inputs = {
|
|
9
|
-
"
|
|
10
|
-
"path":
|
|
9
|
+
"Melanoma (63 Screens)": {
|
|
10
|
+
"path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
|
|
11
11
|
"sort": "high"
|
|
12
12
|
},
|
|
13
|
-
"
|
|
14
|
-
"path":
|
|
13
|
+
"Liver (24 Screens)": {
|
|
14
|
+
"path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
|
|
15
15
|
"sort": "high"
|
|
16
|
-
}
|
|
16
|
+
},
|
|
17
|
+
"Neuroblastoma (37 Screens)": {
|
|
18
|
+
"path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
|
|
19
|
+
"sort": "high"
|
|
20
|
+
},
|
|
17
21
|
}
|
|
18
22
|
|
|
19
|
-
#%%
|
|
20
23
|
|
|
24
|
+
#%%
|
|
21
25
|
default_config = {
|
|
22
26
|
"min_genes_in_complex": 0,
|
|
23
27
|
"min_genes_per_complex_analysis": 3,
|
|
@@ -27,7 +31,7 @@ default_config = {
|
|
|
27
31
|
"jaccard": True,
|
|
28
32
|
"plotting": {
|
|
29
33
|
"save_plot": True,
|
|
30
|
-
"output_type": "
|
|
34
|
+
"output_type": "pdf",
|
|
31
35
|
},
|
|
32
36
|
"preprocessing": {
|
|
33
37
|
"fill_na": True,
|
|
@@ -43,7 +47,6 @@ default_config = {
|
|
|
43
47
|
flex.initialize(default_config)
|
|
44
48
|
|
|
45
49
|
# Load datasets and gold standard terms
|
|
46
|
-
|
|
47
50
|
data, _ = flex.load_datasets(inputs)
|
|
48
51
|
terms, genes_in_terms = flex.load_gold_standard()
|
|
49
52
|
|
|
@@ -51,16 +54,17 @@ terms, genes_in_terms = flex.load_gold_standard()
|
|
|
51
54
|
#%%
|
|
52
55
|
# Run analysis
|
|
53
56
|
for name, dataset in data.items():
|
|
54
|
-
|
|
55
|
-
fpc = flex.pra_percomplex(name, dataset, is_corr=
|
|
57
|
+
pra = flex.pra(name, dataset, is_corr=False)
|
|
58
|
+
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
56
59
|
cc = flex.complex_contributions(name)
|
|
60
|
+
|
|
57
61
|
|
|
58
62
|
|
|
59
63
|
#%%
|
|
60
64
|
# Generate plots
|
|
61
65
|
flex.plot_auc_scores()
|
|
62
66
|
flex.plot_precision_recall_curve()
|
|
63
|
-
flex.plot_percomplex_scatter()
|
|
67
|
+
flex.plot_percomplex_scatter(n_top=20)
|
|
64
68
|
flex.plot_percomplex_scatter_bysize()
|
|
65
69
|
flex.plot_significant_complexes()
|
|
66
70
|
flex.plot_complex_contributions()
|
|
@@ -82,27 +86,3 @@ flex.save_results_to_csv()
|
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
|
|
85
|
-
# %%
|
|
86
|
-
import os
|
|
87
|
-
import glob
|
|
88
|
-
|
|
89
|
-
inputs = {
|
|
90
|
-
"depmap all": {
|
|
91
|
-
"path": "../../../../datasets/depmap/24Q4/depmap_geneeffect_all_cellines.csv",
|
|
92
|
-
"sort": "high"
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
# Now auto-discover the rest of the CSVs in the folder
|
|
97
|
-
DATA_DIR = "../../../../datasets/depmap/24Q4/subset/"
|
|
98
|
-
for path in glob.glob(os.path.join(DATA_DIR, "*.csv")):
|
|
99
|
-
|
|
100
|
-
# Derive the key name from filename (without extension)
|
|
101
|
-
key = os.path.splitext(os.path.basename(path))[0]
|
|
102
|
-
inputs[key] = {
|
|
103
|
-
"path": path,
|
|
104
|
-
"sort": "high"
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
# inputs now has "depmap all" first, then one entry per CSV in DATA_DIR
|
|
108
|
-
print(inputs)
|
|
@@ -8,6 +8,8 @@ model = pd.read_csv("../../../../datasets/depmap/24Q4/Model.csv",index_col=0)
|
|
|
8
8
|
df.columns = df.columns.str.split(" \\(").str[0]
|
|
9
9
|
df = df.T
|
|
10
10
|
|
|
11
|
+
#%%
|
|
12
|
+
|
|
11
13
|
# %%
|
|
12
14
|
# get ModelID of selected disease for example OncotreePrimaryDisease==Melanoma
|
|
13
15
|
melanoma = model[model.OncotreePrimaryDisease=="Melanoma"].index.unique().values
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#%%
|
|
2
|
+
import pythonflex as flex
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Define specific cell line types you're interested in
|
|
6
|
+
DATA_DIR = "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/"
|
|
7
|
+
|
|
8
|
+
# Specific cell-line files of interest (the "_cell_lines.csv" suffix is stripped later to form the input keys)
|
|
9
|
+
cell_line_files = [
|
|
10
|
+
"soft_tissue_cell_lines.csv",
|
|
11
|
+
"skin_cell_lines.csv",
|
|
12
|
+
# "lung_cell_lines.csv",
|
|
13
|
+
# "head_and_neck_cell_lines.csv",
|
|
14
|
+
# "esophagus_stomach_cell_lines.csv",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
inputs = {}
|
|
18
|
+
|
|
19
|
+
# Create inputs dict with shortened names (removing "_cell_lines" suffix)
|
|
20
|
+
for filename in cell_line_files:
|
|
21
|
+
# Remove .csv extension and _cell_lines suffix
|
|
22
|
+
key = filename.replace("_cell_lines.csv", "")
|
|
23
|
+
full_path = os.path.join(DATA_DIR, filename)
|
|
24
|
+
|
|
25
|
+
inputs[key] = {
|
|
26
|
+
"path": full_path,
|
|
27
|
+
"sort": "high"
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
inputs['depmap'] = {
|
|
31
|
+
"path": "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
|
|
32
|
+
"sort": "high"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
# Print the resulting inputs dictionary
|
|
36
|
+
print("Configured inputs:")
|
|
37
|
+
for key, value in inputs.items():
|
|
38
|
+
print(f" {key}: {value['path']}")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
default_config = {
|
|
43
|
+
"min_genes_in_complex": 2,
|
|
44
|
+
"min_genes_per_complex_analysis": 2,
|
|
45
|
+
"output_folder": "25q2_min_genes_2",
|
|
46
|
+
"gold_standard": "CORUM",
|
|
47
|
+
"color_map": "RdYlBu",
|
|
48
|
+
"jaccard": True,
|
|
49
|
+
"plotting": {
|
|
50
|
+
"save_plot": True,
|
|
51
|
+
"output_type": "pdf",
|
|
52
|
+
},
|
|
53
|
+
"preprocessing": {
|
|
54
|
+
"fill_na": True,
|
|
55
|
+
"normalize": False,
|
|
56
|
+
},
|
|
57
|
+
"corr_function": "numpy",
|
|
58
|
+
"logging": {
|
|
59
|
+
"visible_levels": ["DONE","STARTED"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
# Initialize logger, config, and output folder
|
|
64
|
+
flex.initialize(default_config)
|
|
65
|
+
|
|
66
|
+
# Load datasets and gold standard terms
|
|
67
|
+
data, _ = flex.load_datasets(inputs)
|
|
68
|
+
terms, genes_in_terms = flex.load_gold_standard()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
#%%
|
|
72
|
+
# Run analysis
|
|
73
|
+
for name, dataset in data.items():
|
|
74
|
+
pra = flex.pra(name, dataset, is_corr=False)
|
|
75
|
+
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
76
|
+
cc = flex.complex_contributions(name)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
#%%
|
|
81
|
+
# Generate plots
|
|
82
|
+
flex.plot_auc_scores()
|
|
83
|
+
flex.plot_precision_recall_curve()
|
|
84
|
+
flex.plot_percomplex_scatter()
|
|
85
|
+
flex.plot_percomplex_scatter_bysize()
|
|
86
|
+
flex.plot_significant_complexes()
|
|
87
|
+
flex.plot_complex_contributions()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
#%%
|
|
91
|
+
# Save results to CSV
|
|
92
|
+
flex.save_results_to_csv()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
#%%
|
|
103
|
+
|
|
104
|
+
|