pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +28 -4
- pythonflex/analysis.py +287 -579
- pythonflex/examples/basic_usage.py +38 -30
- pythonflex/examples/manuscript.py +37 -43
- pythonflex/examples/runtime/runtime_benchmark.py +218 -0
- pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
- pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
- pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
- pythonflex/old_functions.py +422 -0
- pythonflex/plotting.py +655 -242
- pythonflex/preprocessing.py +54 -216
- pythonflex/utils.py +36 -9
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
- pythonflex-0.4.dist-info/RECORD +32 -0
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
- pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
- pythonflex-0.3.4.dist-info/RECORD +0 -24
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0
pythonflex/preprocessing.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import numpy as np
|
|
4
|
-
from .utils import dsave, dload
|
|
4
|
+
from .utils import dsave, dload, normalize_analysis_genes
|
|
5
5
|
from tqdm import tqdm
|
|
6
6
|
from .logging_config import log
|
|
7
7
|
tqdm.pandas()
|
|
@@ -68,7 +68,16 @@ def _load_file(filepath, ext):
|
|
|
68
68
|
def load_datasets(files, continue_with_common_genes=False):
|
|
69
69
|
config = dload("config")
|
|
70
70
|
preprocessing = config["preprocessing"]
|
|
71
|
-
|
|
71
|
+
analysis_genes_raw = config.get("analysis_genes", "")
|
|
72
|
+
analysis_genes_missing = (
|
|
73
|
+
analysis_genes_raw is None or str(analysis_genes_raw).strip() == ""
|
|
74
|
+
)
|
|
75
|
+
analysis_genes = normalize_analysis_genes(
|
|
76
|
+
analysis_genes_raw,
|
|
77
|
+
legacy_use_common_genes=(
|
|
78
|
+
config.get("use_common_genes") if analysis_genes_missing else None
|
|
79
|
+
),
|
|
80
|
+
)
|
|
72
81
|
data_dict= {}
|
|
73
82
|
|
|
74
83
|
for filename, meta in files.items():
|
|
@@ -103,14 +112,16 @@ def load_datasets(files, continue_with_common_genes=False):
|
|
|
103
112
|
|
|
104
113
|
common_genes = get_common_genes(data_dict)
|
|
105
114
|
|
|
106
|
-
# Apply common gene filtering only
|
|
107
|
-
if
|
|
108
|
-
log.info(f"Applying common gene filtering: {len(common_genes)} genes")
|
|
109
|
-
for filename, df in data_dict.items():
|
|
110
|
-
if df.index.isin(common_genes).any():
|
|
111
|
-
data_dict[filename] = df.loc[common_genes]
|
|
112
|
-
|
|
113
|
-
log.info(
|
|
115
|
+
# Apply common gene filtering only when analysis_genes='shared' (or forced by arg)
|
|
116
|
+
if analysis_genes == "shared" or continue_with_common_genes:
|
|
117
|
+
log.info(f"Applying common gene filtering: {len(common_genes)} genes")
|
|
118
|
+
for filename, df in data_dict.items():
|
|
119
|
+
if df.index.isin(common_genes).any():
|
|
120
|
+
data_dict[filename] = df.loc[common_genes]
|
|
121
|
+
else:
|
|
122
|
+
log.info(
|
|
123
|
+
f"Skipping common gene filtering (analysis_genes='dataset_specific'). Common genes found: {len(common_genes)}"
|
|
124
|
+
)
|
|
114
125
|
|
|
115
126
|
dsave({
|
|
116
127
|
"datasets": data_dict,
|
|
@@ -175,7 +186,7 @@ def filter_matrix_by_genes(matrix, genes_present_in_terms):
|
|
|
175
186
|
genes = matrix.index.intersection(genes_present_in_terms)
|
|
176
187
|
matrix = matrix.loc[genes, genes]
|
|
177
188
|
log.done(f"Filtering matrix: {matrix.shape}")
|
|
178
|
-
return matrix
|
|
189
|
+
return matrix
|
|
179
190
|
|
|
180
191
|
|
|
181
192
|
|
|
@@ -185,21 +196,24 @@ def load_gold_standard():
|
|
|
185
196
|
package_dir = return_package_dir()
|
|
186
197
|
data_dir_path = os.path.join(package_dir, 'data')
|
|
187
198
|
|
|
188
|
-
config = dload("config")
|
|
189
|
-
|
|
199
|
+
config = dload("config")
|
|
200
|
+
analysis_genes_raw = config.get("analysis_genes", "")
|
|
201
|
+
analysis_genes_missing = (
|
|
202
|
+
analysis_genes_raw is None or str(analysis_genes_raw).strip() == ""
|
|
203
|
+
)
|
|
204
|
+
analysis_genes = normalize_analysis_genes(
|
|
205
|
+
analysis_genes_raw,
|
|
206
|
+
legacy_use_common_genes=(
|
|
207
|
+
config.get("use_common_genes") if analysis_genes_missing else None
|
|
208
|
+
),
|
|
209
|
+
)
|
|
190
210
|
|
|
191
211
|
gold_standard_source = config['gold_standard']
|
|
192
212
|
jaccard_enabled = bool(config.get("jaccard", False))
|
|
193
|
-
jaccard_threshold_raw = config.get("jaccard_threshold", 1.0)
|
|
194
|
-
try:
|
|
195
|
-
jaccard_threshold = float(jaccard_threshold_raw) # type: ignore[arg-type]
|
|
196
|
-
except (TypeError, ValueError):
|
|
197
|
-
raise ValueError(
|
|
198
|
-
f"config['jaccard_threshold'] must be a number in (0, 1], got {jaccard_threshold_raw!r}"
|
|
199
|
-
)
|
|
200
213
|
log.done(
|
|
201
214
|
f"Loading gold standard: {gold_standard_source}, Min complex size: {config['min_genes_in_complex']}, "
|
|
202
|
-
f"Jaccard filtering: {jaccard_enabled} (
|
|
215
|
+
f"Jaccard filtering: {jaccard_enabled} (exact duplicate used_genes after dataset filtering), "
|
|
216
|
+
f"analysis_genes: {analysis_genes}"
|
|
203
217
|
)
|
|
204
218
|
|
|
205
219
|
# Define gold standard file paths for predefined sources
|
|
@@ -235,38 +249,6 @@ def load_gold_standard():
|
|
|
235
249
|
terms = terms[terms["n_all_genes"] >= config['min_genes_in_complex']]
|
|
236
250
|
log.done(f"After min_genes_in_complex filtering: {len(terms)} terms")
|
|
237
251
|
|
|
238
|
-
if jaccard_enabled:
|
|
239
|
-
if not (0.0 < jaccard_threshold <= 1.0):
|
|
240
|
-
raise ValueError(f"config['jaccard_threshold'] must be in (0, 1], got {jaccard_threshold}")
|
|
241
|
-
|
|
242
|
-
if jaccard_threshold >= 1.0:
|
|
243
|
-
log.done("Applying Jaccard filtering (threshold=1.0). Removing terms with identical gene sets.")
|
|
244
|
-
# Use all genes for jaccard filtering
|
|
245
|
-
terms["gene_set"] = terms["all_genes"].map(lambda x: frozenset(x))
|
|
246
|
-
grouped = terms.groupby("gene_set", sort=False)
|
|
247
|
-
duplicate_clusters = []
|
|
248
|
-
for _, group in grouped:
|
|
249
|
-
if len(group) > 1:
|
|
250
|
-
duplicate_clusters.append(group["ID"].values if "ID" in group.columns else group.index.values)
|
|
251
|
-
|
|
252
|
-
keep_ids = set(terms["ID"] if "ID" in terms.columns else terms.index)
|
|
253
|
-
for cluster in duplicate_clusters:
|
|
254
|
-
sorted_ids = sorted(cluster)
|
|
255
|
-
keep_ids.difference_update(sorted_ids[1:])
|
|
256
|
-
|
|
257
|
-
if "ID" in terms.columns:
|
|
258
|
-
terms = terms[terms["ID"].isin(keep_ids)].copy()
|
|
259
|
-
else:
|
|
260
|
-
terms = terms[terms.index.isin(keep_ids)].copy()
|
|
261
|
-
terms.drop(columns=["gene_set"], inplace=True, errors="ignore")
|
|
262
|
-
log.done(f"After Jaccard filtering: {len(terms)} terms")
|
|
263
|
-
else:
|
|
264
|
-
log.done(
|
|
265
|
-
f"Applying Jaccard filtering (threshold={jaccard_threshold}). Removing highly similar terms."
|
|
266
|
-
)
|
|
267
|
-
terms = _filter_terms_by_jaccard_threshold(terms, threshold=jaccard_threshold, genes_col="all_genes")
|
|
268
|
-
log.done(f"After Jaccard filtering: {len(terms)} terms")
|
|
269
|
-
|
|
270
252
|
# if there is column called "ID", set it as index
|
|
271
253
|
if "ID" in terms.columns:
|
|
272
254
|
terms = terms.set_index("ID")
|
|
@@ -276,177 +258,33 @@ def load_gold_standard():
|
|
|
276
258
|
return terms, None # Return None for genes_present_in_terms - will be computed per dataset
|
|
277
259
|
|
|
278
260
|
|
|
279
|
-
class _UnionFind:
|
|
280
|
-
def __init__(self, n: int):
|
|
281
|
-
self.parent = list(range(n))
|
|
282
|
-
self.rank = [0] * n
|
|
283
|
-
|
|
284
|
-
def find(self, x: int) -> int:
|
|
285
|
-
while self.parent[x] != x:
|
|
286
|
-
self.parent[x] = self.parent[self.parent[x]]
|
|
287
|
-
x = self.parent[x]
|
|
288
|
-
return x
|
|
289
|
-
|
|
290
|
-
def union(self, a: int, b: int) -> None:
|
|
291
|
-
ra, rb = self.find(a), self.find(b)
|
|
292
|
-
if ra == rb:
|
|
293
|
-
return
|
|
294
|
-
if self.rank[ra] < self.rank[rb]:
|
|
295
|
-
self.parent[ra] = rb
|
|
296
|
-
elif self.rank[ra] > self.rank[rb]:
|
|
297
|
-
self.parent[rb] = ra
|
|
298
|
-
else:
|
|
299
|
-
self.parent[rb] = ra
|
|
300
|
-
self.rank[ra] += 1
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
def _safe_id_sort_key(val):
|
|
304
|
-
"""Sort key that prefers numeric ordering when IDs look like ints."""
|
|
305
|
-
try:
|
|
306
|
-
return (0, int(val))
|
|
307
|
-
except Exception:
|
|
308
|
-
return (1, str(val))
|
|
309
|
-
|
|
310
261
|
|
|
311
|
-
def _jaccard_similarity(a: set, b: set) -> float:
|
|
312
|
-
if not a and not b:
|
|
313
|
-
return 1.0
|
|
314
|
-
if not a or not b:
|
|
315
|
-
return 0.0
|
|
316
|
-
inter = len(a.intersection(b))
|
|
317
|
-
if inter == 0:
|
|
318
|
-
return 0.0
|
|
319
|
-
union = len(a) + len(b) - inter
|
|
320
|
-
return inter / union
|
|
321
262
|
|
|
322
263
|
|
|
323
|
-
def
|
|
324
|
-
"""
|
|
264
|
+
def filter_duplicate_terms(terms: pd.DataFrame) -> pd.DataFrame:
|
|
265
|
+
"""Backward-compatible wrapper for exact-duplicate filtering.
|
|
325
266
|
|
|
326
|
-
|
|
327
|
-
This uses an exact Jaccard similarity join with prefix-filter candidate generation.
|
|
267
|
+
This removes exact duplicate `used_genes` sets and keeps the smallest ID.
|
|
328
268
|
"""
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
return terms
|
|
332
|
-
|
|
333
|
-
# Build IDs and gene sets
|
|
334
|
-
id_col = "ID" if "ID" in terms.columns else None
|
|
335
|
-
term_ids = (terms["ID"].tolist() if id_col else terms.index.tolist())
|
|
336
|
-
gene_sets = []
|
|
337
|
-
for genes in terms[genes_col].tolist():
|
|
338
|
-
gene_sets.append(set(genes))
|
|
339
|
-
|
|
340
|
-
sizes = [len(s) for s in gene_sets]
|
|
341
|
-
if len(gene_sets) <= 1:
|
|
342
|
-
return terms
|
|
343
|
-
|
|
344
|
-
# Global token frequency for ordering (rare tokens first)
|
|
345
|
-
from collections import Counter, defaultdict
|
|
346
|
-
freq = Counter()
|
|
347
|
-
for s in gene_sets:
|
|
348
|
-
freq.update(s)
|
|
349
|
-
|
|
350
|
-
def sort_tokens(s: set):
|
|
351
|
-
return sorted(s, key=lambda tok: (freq.get(tok, 0), str(tok)))
|
|
352
|
-
|
|
353
|
-
# Process smaller sets first (helps size filtering and keeps index smaller)
|
|
354
|
-
order = sorted(range(len(gene_sets)), key=lambda i: (sizes[i], _safe_id_sort_key(term_ids[i])))
|
|
355
|
-
ordered_tokens = [sort_tokens(gene_sets[i]) for i in range(len(gene_sets))]
|
|
356
|
-
|
|
357
|
-
# Inverted index over prefix tokens
|
|
358
|
-
inv_index = defaultdict(list) # token -> list of processed term indices (original idx)
|
|
359
|
-
|
|
360
|
-
uf = _UnionFind(len(gene_sets))
|
|
361
|
-
|
|
362
|
-
# Precompute prefix lengths
|
|
363
|
-
import math
|
|
364
|
-
prefix_len = []
|
|
365
|
-
for i in range(len(gene_sets)):
|
|
366
|
-
m = sizes[i]
|
|
367
|
-
# PPJoin prefix length for Jaccard threshold
|
|
368
|
-
p = m - math.ceil(threshold * m) + 1
|
|
369
|
-
if p < 0:
|
|
370
|
-
p = 0
|
|
371
|
-
if p > m:
|
|
372
|
-
p = m
|
|
373
|
-
prefix_len.append(p)
|
|
374
|
-
|
|
375
|
-
# Candidate generation + exact verification
|
|
376
|
-
for idx_pos, i in enumerate(order):
|
|
377
|
-
tokens_i = ordered_tokens[i]
|
|
378
|
-
p_i = prefix_len[i]
|
|
379
|
-
|
|
380
|
-
# Count shared prefix tokens with previously indexed sets
|
|
381
|
-
candidate_overlap_lb = defaultdict(int)
|
|
382
|
-
for tok in tokens_i[:p_i]:
|
|
383
|
-
for j in inv_index.get(tok, []):
|
|
384
|
-
# size filter: if too different in size, cannot meet Jaccard threshold
|
|
385
|
-
if sizes[j] < threshold * sizes[i]:
|
|
386
|
-
continue
|
|
387
|
-
if sizes[j] > sizes[i] / threshold:
|
|
388
|
-
continue
|
|
389
|
-
candidate_overlap_lb[j] += 1
|
|
390
|
-
inv_index[tok].append(i)
|
|
391
|
-
|
|
392
|
-
if not candidate_overlap_lb:
|
|
393
|
-
continue
|
|
394
|
-
|
|
395
|
-
set_i = gene_sets[i]
|
|
396
|
-
for j in candidate_overlap_lb.keys():
|
|
397
|
-
# Exact verification
|
|
398
|
-
sim = _jaccard_similarity(set_i, gene_sets[j])
|
|
399
|
-
if sim >= threshold:
|
|
400
|
-
uf.union(i, j)
|
|
401
|
-
|
|
402
|
-
# Choose representative (smallest ID) for each connected component
|
|
403
|
-
components = {}
|
|
404
|
-
for i in range(len(gene_sets)):
|
|
405
|
-
root = uf.find(i)
|
|
406
|
-
components.setdefault(root, []).append(i)
|
|
407
|
-
|
|
408
|
-
keep_original_indices = set()
|
|
409
|
-
for members in components.values():
|
|
410
|
-
# Keep smallest ID among members
|
|
411
|
-
keep = min(members, key=lambda k: _safe_id_sort_key(term_ids[k]))
|
|
412
|
-
keep_original_indices.add(keep)
|
|
413
|
-
|
|
414
|
-
if id_col:
|
|
415
|
-
keep_ids = {term_ids[i] for i in keep_original_indices}
|
|
416
|
-
return terms[terms["ID"].isin(keep_ids)].copy()
|
|
417
|
-
else:
|
|
418
|
-
keep_index = {term_ids[i] for i in keep_original_indices}
|
|
419
|
-
return terms[terms.index.isin(keep_index)].copy()
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
def filter_duplicate_terms(terms):
|
|
426
|
-
log.started("Filtering duplicate terms using optimized method.")
|
|
427
|
-
|
|
428
|
-
# Precompute frozen gene sets and hash them
|
|
269
|
+
log.started("Filtering duplicate terms using exact used_genes sets.")
|
|
270
|
+
before = len(terms)
|
|
429
271
|
terms = terms.copy()
|
|
430
272
|
terms["gene_set"] = terms["used_genes"].map(lambda x: frozenset(x))
|
|
431
|
-
|
|
432
|
-
# Group by identical gene sets
|
|
433
273
|
grouped = terms.groupby("gene_set", sort=False)
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
274
|
+
|
|
275
|
+
id_values = terms["ID"] if "ID" in terms.columns else terms.index
|
|
276
|
+
keep_ids = set(id_values)
|
|
437
277
|
for _, group in grouped:
|
|
438
|
-
if len(group)
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
# Determine which IDs to keep (smallest ID in each duplicate cluster)
|
|
442
|
-
keep_ids = set(terms["ID"])
|
|
443
|
-
for cluster in duplicate_clusters:
|
|
278
|
+
if len(group) <= 1:
|
|
279
|
+
continue
|
|
280
|
+
cluster = group["ID"].values if "ID" in group.columns else group.index.values
|
|
444
281
|
sorted_ids = sorted(cluster)
|
|
445
|
-
keep_ids.difference_update(sorted_ids[1:])
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
282
|
+
keep_ids.difference_update(sorted_ids[1:])
|
|
283
|
+
|
|
284
|
+
if "ID" in terms.columns:
|
|
285
|
+
filtered = terms[terms["ID"].isin(keep_ids)].copy()
|
|
286
|
+
else:
|
|
287
|
+
filtered = terms[terms.index.isin(keep_ids)].copy()
|
|
288
|
+
filtered.drop(columns=["gene_set"], inplace=True, errors="ignore")
|
|
289
|
+
log.done(f"{before - len(filtered)} terms removed due to identical gene sets.")
|
|
452
290
|
return filtered
|
pythonflex/utils.py
CHANGED
|
@@ -5,12 +5,39 @@ import joblib
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
|
+
from .logging_config import log
|
|
9
|
+
|
|
8
10
|
# Constants
|
|
9
11
|
TMP_ROOT = ".tmp"
|
|
10
|
-
VALID_EXTS = {".parquet", ".npy", ".pkl"} # Removed .feather
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
VALID_EXTS = {".parquet", ".npy", ".pkl"} # Removed .feather
|
|
13
|
+
|
|
14
|
+
ANALYSIS_GENES_ALIASES = {
|
|
15
|
+
"shared": "shared",
|
|
16
|
+
"common": "shared",
|
|
17
|
+
"dataset_specific": "dataset_specific",
|
|
18
|
+
"dataset": "dataset_specific",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def normalize_analysis_genes(value=None, legacy_use_common_genes=None):
|
|
23
|
+
"""Normalize analysis_genes config values while keeping legacy names working."""
|
|
24
|
+
if value is None or str(value).strip() == "":
|
|
25
|
+
if legacy_use_common_genes is not None:
|
|
26
|
+
return "shared" if bool(legacy_use_common_genes) else "dataset_specific"
|
|
27
|
+
value = "shared"
|
|
28
|
+
|
|
29
|
+
key = str(value).strip().lower().replace("-", "_").replace(" ", "_")
|
|
30
|
+
if key not in ANALYSIS_GENES_ALIASES:
|
|
31
|
+
raise ValueError(
|
|
32
|
+
"config['analysis_genes'] must be one of "
|
|
33
|
+
"['shared', 'dataset_specific'] "
|
|
34
|
+
"(legacy aliases ['common', 'dataset'] are also supported), "
|
|
35
|
+
f"got {value!r}"
|
|
36
|
+
)
|
|
37
|
+
return ANALYSIS_GENES_ALIASES[key]
|
|
38
|
+
|
|
39
|
+
# Helper to sanitize names (make filesystem-safe)
|
|
40
|
+
def _sanitize(name):
|
|
14
41
|
if not name:
|
|
15
42
|
return "data"
|
|
16
43
|
# Replace forbidden/problematic chars with '_', collapse multiples, strip edges
|
|
@@ -59,7 +86,7 @@ def dload(category, name=None, path=None):
|
|
|
59
86
|
dir_path = os.path.join(TMP_ROOT, _sanitize(category))
|
|
60
87
|
|
|
61
88
|
if not os.path.exists(dir_path):
|
|
62
|
-
return {}
|
|
89
|
+
return {} if name is None else None
|
|
63
90
|
|
|
64
91
|
if name is None:
|
|
65
92
|
# Load all in category as dict
|
|
@@ -77,7 +104,7 @@ def dload(category, name=None, path=None):
|
|
|
77
104
|
elif filename.endswith(".pkl"):
|
|
78
105
|
out[k] = joblib.load(full_path, mmap_mode="r")
|
|
79
106
|
except (EOFError, ValueError, OSError):
|
|
80
|
-
|
|
107
|
+
log.warning(f"'{full_path}' is corrupted. Skipping...")
|
|
81
108
|
os.remove(full_path)
|
|
82
109
|
return out
|
|
83
110
|
|
|
@@ -93,9 +120,9 @@ def dload(category, name=None, path=None):
|
|
|
93
120
|
elif ext == ".pkl":
|
|
94
121
|
return joblib.load(target, mmap_mode="r")
|
|
95
122
|
except (EOFError, ValueError, OSError) as e:
|
|
96
|
-
|
|
123
|
+
log.warning(f"'{target}' is corrupted ({e}). Trying next format...")
|
|
97
124
|
os.remove(target)
|
|
98
125
|
continue
|
|
99
126
|
|
|
100
|
-
|
|
101
|
-
return
|
|
127
|
+
log.warning(f"No valid file found for {category}/{name}")
|
|
128
|
+
return None
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pythonflex
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4
|
|
4
4
|
Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
|
|
5
5
|
Author-email: Yasir Demirtaş <tyasird@hotmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Classifier: License :: OSI Approved :: MIT License
|
|
7
8
|
Classifier: Operating System :: OS Independent
|
|
8
9
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -114,7 +115,10 @@ default_config = {
|
|
|
114
115
|
"gold_standard": "GOBP",
|
|
115
116
|
"color_map": "RdYlBu",
|
|
116
117
|
"jaccard": True,
|
|
117
|
-
|
|
118
|
+
# Which genes define the evaluated space:
|
|
119
|
+
# - 'common' : intersect terms with genes common across datasets
|
|
120
|
+
# - 'dataset' : intersect terms with genes present in each dataset
|
|
121
|
+
"analysis_genes": "common",
|
|
118
122
|
"plotting": {
|
|
119
123
|
"save_plot": True,
|
|
120
124
|
"output_type": "png",
|
|
@@ -139,7 +143,7 @@ terms, genes_in_terms = flex.load_gold_standard()
|
|
|
139
143
|
|
|
140
144
|
# Run analysis
|
|
141
145
|
for name, dataset in data.items():
|
|
142
|
-
df
|
|
146
|
+
df = flex.pra(name, dataset)
|
|
143
147
|
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
144
148
|
cc = flex.complex_contributions(name)
|
|
145
149
|
|
|
@@ -150,9 +154,7 @@ flex.plot_percomplex_scatter()
|
|
|
150
154
|
flex.plot_percomplex_scatter_bysize()
|
|
151
155
|
flex.plot_significant_complexes()
|
|
152
156
|
flex.plot_complex_contributions()
|
|
153
|
-
flex.
|
|
154
|
-
flex.plot_mpr_complexes_multi(show_filters="all")
|
|
155
|
-
flex.plot_mpr_complexes_auc_scores("all")
|
|
157
|
+
flex.plot_mpr_summary(variants="unfiltered")
|
|
156
158
|
|
|
157
159
|
flex.save_results_to_csv()
|
|
158
160
|
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
pythonflex/__init__.py,sha256=D3go2UnojWzRAFr8ahjrPwDTR3np__hDHqaAhBqUGHc,2640
|
|
2
|
+
pythonflex/analysis.py,sha256=H164EqsG5ewgshaYQOXUOh5pjF7TcWXGsmQvWzHFT6M,64615
|
|
3
|
+
pythonflex/logging_config.py,sha256=iqRKK18zvtfV_-bYHWrXtSZywiUtYxoHkw0ZnVORQBQ,2015
|
|
4
|
+
pythonflex/old_functions.py,sha256=regtkNGCS3ph0OBssAg8Sg1ivn1-kiRB54Q7xIVeQ4E,19031
|
|
5
|
+
pythonflex/plotting.py,sha256=Ie0F79q-DSkn8YzytjJf2FQthfvY_OWlR3B_sJMpXac,89247
|
|
6
|
+
pythonflex/preprocessing.py,sha256=XwLul6MQN2Gzjg-_JH0I14qklRslO8f6GCoYk3JpWpA,11251
|
|
7
|
+
pythonflex/utils.py,sha256=WqsCEnccNGIEwWuq_r34mIqDBBuu8mkFXW4zPIm0IwA,4665
|
|
8
|
+
pythonflex/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
pythonflex/data/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
pythonflex/data/dataset/liver_cell_lines_500_genes.csv,sha256=qfKsqPjL41Y1GuxxAhc-MfaNO0mX6Qju_SeynKSpEiM,238639
|
|
11
|
+
pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv,sha256=ByxcaDDqLlRtAyuCKhHeFQCitIBk2-Q4Hn6k8BNUF6c,620887
|
|
12
|
+
pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv,sha256=IxJI8E-smagbxHRvTjvQLZxuu89MuCw8XrReMSsViUI,365992
|
|
13
|
+
pythonflex/data/gold_standard/CORUM.parquet,sha256=AkLiflQAeQ6K3HG-PIdLbZ8vEF9GtNObtlY7TkxHyaw,131858
|
|
14
|
+
pythonflex/data/gold_standard/GOBP.parquet,sha256=YQGhRcHSiN_cMKytCUYNCfcDwYj9L3TLFNwobiS2f3M,1025099
|
|
15
|
+
pythonflex/data/gold_standard/PATHWAY.parquet,sha256=bFRDe3PQ_TFc7B1uZuynwOGcgxESLaOy1Zt5gpQ1Oso,277386
|
|
16
|
+
pythonflex/data/gold_standard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
+
pythonflex/data/gold_standard/corum.csv,sha256=2rZeyr2Ghm7f-gFxCZnhPtxI2jxRoiZMUEH2EJwAgsI,208889
|
|
18
|
+
pythonflex/data/gold_standard/gobp.csv,sha256=TO9yfx9mO8WkXvWfSB-pFId9T8xYfqdZpshAXC0Fyj8,1739167
|
|
19
|
+
pythonflex/data/gold_standard/pathway.csv,sha256=J3HKVLUZ_Oxucmn_14ieYp3Wr2lcKtp0nIl4_8_K2Yc,489424
|
|
20
|
+
pythonflex/examples/basic_usage.py,sha256=06IPn_yon9DwXefpfb0Fo6uG_5CyRCT_ZCMZSPM-3Ww,2695
|
|
21
|
+
pythonflex/examples/manuscript.py,sha256=AyzycXys9BII4TZFyBi-bXksAWlrTcFZHCdc2gu8D0U,2984
|
|
22
|
+
pythonflex/examples/runtime/runtime_benchmark.py,sha256=kjOELNHzbk8Y-DDe1IPl75R2UNnxfdgxZBR_SDdcQgc,5688
|
|
23
|
+
pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py,sha256=kXQ8ycnqmWxbjoEWoyT1DC3O1JWl90tlNvA7KVskfz4,14832
|
|
24
|
+
pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py,sha256=n6ON4cRHuN5y1YU32OH2lsXF-TPNQ4B2635MtwCOXKY,6361
|
|
25
|
+
pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py,sha256=sC3cMV8MxFqT8GWFgskkIMx6jz7if6BWWkhC7jip0ks,9130
|
|
26
|
+
pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py,sha256=1Us2sISWbjaM2hahQ1hG-XSrlTaWzcDnvVQrItHo_ls,11934
|
|
27
|
+
pythonflex/examples/runtime/runtime_benchmark_repeated.py,sha256=6-NV1aLbScdmH13heu6_rROAJk0L1VZ1byO95kMtzX8,9666
|
|
28
|
+
pythonflex-0.4.dist-info/METADATA,sha256=TxzZaLktNrLQvfHu2xcbWPr_nOt7VaHz2sO24BFm7-Q,4463
|
|
29
|
+
pythonflex-0.4.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
30
|
+
pythonflex-0.4.dist-info/entry_points.txt,sha256=37liK1baI_CRVDivpjsn8JDClL9_YeTTuSMAZ3Ty7oE,47
|
|
31
|
+
pythonflex-0.4.dist-info/licenses/LICENSE,sha256=buBzPy38DV0g95acwIGUqsWZwhbpc0tQXnz6nBhjyS8,1091
|
|
32
|
+
pythonflex-0.4.dist-info/RECORD,,
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2026, Yasir Demirtas, Maximilian Billmann
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
pythonflex/__init__.py,sha256=MoDbdVhclK_PF_u9vzN4ntWX6hTRAKfvkTiDisIci5o,1748
|
|
2
|
-
pythonflex/analysis.py,sha256=s9uX4FMXzQ-OaUisf3jzgYLB0JVpfWPe87u9nz5i-y4,77795
|
|
3
|
-
pythonflex/logging_config.py,sha256=iqRKK18zvtfV_-bYHWrXtSZywiUtYxoHkw0ZnVORQBQ,2015
|
|
4
|
-
pythonflex/plotting.py,sha256=AOzgyhJX5bPMoGs2ih2zbA30Dm-OoWpk8MNBC-9OQ94,75981
|
|
5
|
-
pythonflex/preprocessing.py,sha256=_Ecv6ASntz1vESI4EBw1O_9MUKkg9brvmZP9HmDnlLA,17354
|
|
6
|
-
pythonflex/utils.py,sha256=7toGnKbA_TKBtHz1HLk7ckWM0bjuFw_Byhp6ZUJaNs4,3694
|
|
7
|
-
pythonflex/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
pythonflex/data/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
-
pythonflex/data/dataset/liver_cell_lines_500_genes.csv,sha256=qfKsqPjL41Y1GuxxAhc-MfaNO0mX6Qju_SeynKSpEiM,238639
|
|
10
|
-
pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv,sha256=ByxcaDDqLlRtAyuCKhHeFQCitIBk2-Q4Hn6k8BNUF6c,620887
|
|
11
|
-
pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv,sha256=IxJI8E-smagbxHRvTjvQLZxuu89MuCw8XrReMSsViUI,365992
|
|
12
|
-
pythonflex/data/gold_standard/CORUM.parquet,sha256=AkLiflQAeQ6K3HG-PIdLbZ8vEF9GtNObtlY7TkxHyaw,131858
|
|
13
|
-
pythonflex/data/gold_standard/GOBP.parquet,sha256=YQGhRcHSiN_cMKytCUYNCfcDwYj9L3TLFNwobiS2f3M,1025099
|
|
14
|
-
pythonflex/data/gold_standard/PATHWAY.parquet,sha256=bFRDe3PQ_TFc7B1uZuynwOGcgxESLaOy1Zt5gpQ1Oso,277386
|
|
15
|
-
pythonflex/data/gold_standard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
pythonflex/data/gold_standard/corum.csv,sha256=2rZeyr2Ghm7f-gFxCZnhPtxI2jxRoiZMUEH2EJwAgsI,208889
|
|
17
|
-
pythonflex/data/gold_standard/gobp.csv,sha256=TO9yfx9mO8WkXvWfSB-pFId9T8xYfqdZpshAXC0Fyj8,1739167
|
|
18
|
-
pythonflex/data/gold_standard/pathway.csv,sha256=J3HKVLUZ_Oxucmn_14ieYp3Wr2lcKtp0nIl4_8_K2Yc,489424
|
|
19
|
-
pythonflex/examples/basic_usage.py,sha256=hYDj6dkmamnW6jC0MMtfYKDlFrnUBxSxVbmY74Mhy3Q,2291
|
|
20
|
-
pythonflex/examples/manuscript.py,sha256=ru0AphvwUwV0Tn5tCkED7NGaxphmrPN7DYcO7IlhyLk,2845
|
|
21
|
-
pythonflex-0.3.4.dist-info/METADATA,sha256=vQl1I9dqPJzekvCjj2prIsP3_D6KejE894HdYa9nSQ4,4400
|
|
22
|
-
pythonflex-0.3.4.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
23
|
-
pythonflex-0.3.4.dist-info/entry_points.txt,sha256=37liK1baI_CRVDivpjsn8JDClL9_YeTTuSMAZ3Ty7oE,47
|
|
24
|
-
pythonflex-0.3.4.dist-info/RECORD,,
|
|
File without changes
|