pythonflex 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {pythonflex-0.3.3 → pythonflex-0.3.4}/PKG-INFO +6 -3
  2. {pythonflex-0.3.3 → pythonflex-0.3.4}/README.md +5 -2
  3. {pythonflex-0.3.3 → pythonflex-0.3.4}/pyproject.toml +1 -1
  4. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/analysis.py +1 -0
  5. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/examples/basic_usage.py +9 -9
  6. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/examples/manuscript.py +1 -0
  7. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/preprocessing.py +188 -24
  8. {pythonflex-0.3.3 → pythonflex-0.3.4}/.gitignore +0 -0
  9. {pythonflex-0.3.3 → pythonflex-0.3.4}/.python-version +0 -0
  10. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/__init__.py +0 -0
  11. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/__init__.py +0 -0
  12. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/dataset/__init__.py +0 -0
  13. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/dataset/liver_cell_lines_500_genes.csv +0 -0
  14. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv +0 -0
  15. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv +0 -0
  16. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/CORUM.parquet +0 -0
  17. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/GOBP.parquet +0 -0
  18. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/PATHWAY.parquet +0 -0
  19. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/__init__.py +0 -0
  20. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/corum.csv +0 -0
  21. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/gobp.csv +0 -0
  22. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/data/gold_standard/pathway.csv +0 -0
  23. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/logging_config.py +0 -0
  24. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/plotting.py +0 -0
  25. {pythonflex-0.3.3 → pythonflex-0.3.4}/src/pythonflex/utils.py +0 -0
  26. {pythonflex-0.3.3 → pythonflex-0.3.4}/todo.txt +0 -0
  27. {pythonflex-0.3.3 → pythonflex-0.3.4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pythonflex
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
5
5
  Author-email: Yasir Demirtaş <tyasird@hotmail.com>
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -114,6 +114,7 @@ default_config = {
114
114
  "gold_standard": "GOBP",
115
115
  "color_map": "RdYlBu",
116
116
  "jaccard": True,
117
+ "jaccard_threshold": 1.0, # set e.g. 0.90 to remove highly similar terms
117
118
  "plotting": {
118
119
  "save_plot": True,
119
120
  "output_type": "png",
@@ -124,7 +125,7 @@ default_config = {
124
125
  },
125
126
  "corr_function": "numpy",
126
127
  "logging": {
127
- "visible_levels": ["DONE","STARTED"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
128
+ "visible_levels": ["DONE","INFO", "WARNING"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
128
129
  }
129
130
  }
130
131
 
@@ -149,8 +150,10 @@ flex.plot_percomplex_scatter()
149
150
  flex.plot_percomplex_scatter_bysize()
150
151
  flex.plot_significant_complexes()
151
152
  flex.plot_complex_contributions()
153
+ flex.plot_mpr_tp_multi(show_filters="all")
154
+ flex.plot_mpr_complexes_multi(show_filters="all")
155
+ flex.plot_mpr_complexes_auc_scores("all")
152
156
 
153
- # Save Result CSVspyflex.save_results_to_csv()
154
157
  flex.save_results_to_csv()
155
158
 
156
159
 
@@ -83,6 +83,7 @@ default_config = {
83
83
  "gold_standard": "GOBP",
84
84
  "color_map": "RdYlBu",
85
85
  "jaccard": True,
86
+ "jaccard_threshold": 1.0, # set e.g. 0.90 to remove highly similar terms
86
87
  "plotting": {
87
88
  "save_plot": True,
88
89
  "output_type": "png",
@@ -93,7 +94,7 @@ default_config = {
93
94
  },
94
95
  "corr_function": "numpy",
95
96
  "logging": {
96
- "visible_levels": ["DONE","STARTED"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
97
+ "visible_levels": ["DONE","INFO", "WARNING"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
97
98
  }
98
99
  }
99
100
 
@@ -118,8 +119,10 @@ flex.plot_percomplex_scatter()
118
119
  flex.plot_percomplex_scatter_bysize()
119
120
  flex.plot_significant_complexes()
120
121
  flex.plot_complex_contributions()
122
+ flex.plot_mpr_tp_multi(show_filters="all")
123
+ flex.plot_mpr_complexes_multi(show_filters="all")
124
+ flex.plot_mpr_complexes_auc_scores("all")
121
125
 
122
- # Save Result CSVspyflex.save_results_to_csv()
123
126
  flex.save_results_to_csv()
124
127
 
125
128
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pythonflex"
3
- version = "0.3.3"
3
+ version = "0.3.4"
4
4
  description = "pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -43,6 +43,7 @@ def initialize(config={}):
43
43
  "gold_standard": "CORUM",
44
44
  "color_map": "RdYlBu",
45
45
  "jaccard": True,
46
+ "jaccard_threshold": 1.0,
46
47
  "use_common_genes": True,
47
48
  "plotting": {
48
49
  "save_plot": True,
@@ -31,7 +31,8 @@ default_config = {
31
31
  "output_folder": "CORUM",
32
32
  "gold_standard": "CORUM",
33
33
  "color_map": "BuGn",
34
- "jaccard": False,
34
+ "jaccard": True,
35
+ "jaccard_threshold": 1,
35
36
  "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
36
37
  "plotting": {
37
38
  "save_plot": True,
@@ -61,6 +62,7 @@ for name, dataset in data.items():
61
62
  fpc = flex.pra_percomplex(name, dataset, is_corr=False)
62
63
  cc = flex.complex_contributions(name)
63
64
  flex.mpr_prepare(name)
65
+
64
66
 
65
67
 
66
68
 
@@ -73,15 +75,13 @@ for name, dataset in data.items():
73
75
  # flex.plot_percomplex_scatter(n_top=20)
74
76
  # flex.plot_percomplex_scatter_bysize()
75
77
  # flex.plot_complex_contributions()
76
- #%%
77
- #flex.plot_mpr_tp_multi(show_filters="all")
78
- flex.plot_mpr_complexes_multi(show_filters="all")
78
+ # flex.plot_mpr_tp_multi(show_filters="all")
79
+ # flex.plot_mpr_complexes_multi(show_filters="all")
80
+ # flex.plot_mpr_complexes_auc_scores("all")
81
+
82
+
79
83
 
80
84
  #%%
81
85
  # Save results to CSV
82
- flex.save_results_to_csv()
83
-
86
+ # flex.save_results_to_csv()
84
87
 
85
- # %%
86
- flex.plot_mpr_complexes_auc_scores("all")
87
- # %%
@@ -58,6 +58,7 @@ default_config = {
58
58
  "gold_standard": "CORUM",
59
59
  "color_map": "BuGn",
60
60
  "jaccard": False,
61
+ "jaccard_threshold": 1.0,
61
62
  "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
62
63
  "plotting": {
63
64
  "save_plot": True,
@@ -189,7 +189,18 @@ def load_gold_standard():
189
189
  use_common_genes = config.get("use_common_genes", True)
190
190
 
191
191
  gold_standard_source = config['gold_standard']
192
- log.started(f"Loading gold standard: {gold_standard_source}, Min complex size: {config['min_genes_in_complex']}, Jaccard filtering: {config['jaccard']}, use_common_genes: {use_common_genes}")
192
+ jaccard_enabled = bool(config.get("jaccard", False))
193
+ jaccard_threshold_raw = config.get("jaccard_threshold", 1.0)
194
+ try:
195
+ jaccard_threshold = float(jaccard_threshold_raw) # type: ignore[arg-type]
196
+ except (TypeError, ValueError):
197
+ raise ValueError(
198
+ f"config['jaccard_threshold'] must be a number in (0, 1], got {jaccard_threshold_raw!r}"
199
+ )
200
+ log.done(
201
+ f"Loading gold standard: {gold_standard_source}, Min complex size: {config['min_genes_in_complex']}, "
202
+ f"Jaccard filtering: {jaccard_enabled} (threshold={jaccard_threshold}), use_common_genes: {use_common_genes}"
203
+ )
193
204
 
194
205
  # Define gold standard file paths for predefined sources
195
206
  gold_standard_files = {
@@ -217,34 +228,44 @@ def load_gold_standard():
217
228
 
218
229
  # Store raw gold standard for later per-dataset filtering
219
230
  terms["all_genes"] = terms["Genes"].apply(lambda x: list(set(x.split(";"))))
220
- log.info(f"Gold standard loaded with {len(terms)} terms")
231
+ log.done(f"Gold standard loaded with {len(terms)} terms")
221
232
 
222
233
  # Basic filtering by minimum complex size (before gene filtering)
223
234
  terms["n_all_genes"] = terms["all_genes"].apply(len)
224
235
  terms = terms[terms["n_all_genes"] >= config['min_genes_in_complex']]
225
- log.info(f"After min_genes_in_complex filtering: {len(terms)} terms")
226
-
227
- if config['jaccard']:
228
- log.info("Applying Jaccard filtering. Remove terms with identical gene sets.")
229
- # Use all genes for jaccard filtering
230
- terms["gene_set"] = terms["all_genes"].map(lambda x: frozenset(x))
231
- grouped = terms.groupby("gene_set", sort=False)
232
- duplicate_clusters = []
233
- for _, group in grouped:
234
- if len(group) > 1:
235
- duplicate_clusters.append(group["ID"].values if "ID" in group.columns else group.index.values)
236
-
237
- keep_ids = set(terms["ID"] if "ID" in terms.columns else terms.index)
238
- for cluster in duplicate_clusters:
239
- sorted_ids = sorted(cluster)
240
- keep_ids.difference_update(sorted_ids[1:])
241
-
242
- if "ID" in terms.columns:
243
- terms = terms[terms["ID"].isin(keep_ids)].copy()
236
+ log.done(f"After min_genes_in_complex filtering: {len(terms)} terms")
237
+
238
+ if jaccard_enabled:
239
+ if not (0.0 < jaccard_threshold <= 1.0):
240
+ raise ValueError(f"config['jaccard_threshold'] must be in (0, 1], got {jaccard_threshold}")
241
+
242
+ if jaccard_threshold >= 1.0:
243
+ log.done("Applying Jaccard filtering (threshold=1.0). Removing terms with identical gene sets.")
244
+ # Use all genes for jaccard filtering
245
+ terms["gene_set"] = terms["all_genes"].map(lambda x: frozenset(x))
246
+ grouped = terms.groupby("gene_set", sort=False)
247
+ duplicate_clusters = []
248
+ for _, group in grouped:
249
+ if len(group) > 1:
250
+ duplicate_clusters.append(group["ID"].values if "ID" in group.columns else group.index.values)
251
+
252
+ keep_ids = set(terms["ID"] if "ID" in terms.columns else terms.index)
253
+ for cluster in duplicate_clusters:
254
+ sorted_ids = sorted(cluster)
255
+ keep_ids.difference_update(sorted_ids[1:])
256
+
257
+ if "ID" in terms.columns:
258
+ terms = terms[terms["ID"].isin(keep_ids)].copy()
259
+ else:
260
+ terms = terms[terms.index.isin(keep_ids)].copy()
261
+ terms.drop(columns=["gene_set"], inplace=True, errors="ignore")
262
+ log.done(f"After Jaccard filtering: {len(terms)} terms")
244
263
  else:
245
- terms = terms[terms.index.isin(keep_ids)].copy()
246
- terms.drop(columns=["gene_set"], inplace=True, errors='ignore')
247
- log.info(f"After Jaccard filtering: {len(terms)} terms")
264
+ log.done(
265
+ f"Applying Jaccard filtering (threshold={jaccard_threshold}). Removing highly similar terms."
266
+ )
267
+ terms = _filter_terms_by_jaccard_threshold(terms, threshold=jaccard_threshold, genes_col="all_genes")
268
+ log.done(f"After Jaccard filtering: {len(terms)} terms")
248
269
 
249
270
  # if there is column called "ID", set it as index
250
271
  if "ID" in terms.columns:
@@ -255,6 +276,149 @@ def load_gold_standard():
255
276
  return terms, None # Return None for genes_present_in_terms - will be computed per dataset
256
277
 
257
278
 
279
class _UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    Sets are merged with union by rank, and ``find`` shortens the paths it
    walks by re-pointing each visited node at its grandparent (path halving).
    """

    def __init__(self, n: int):
        # Every element starts out as the root of its own singleton set.
        self.parent = [i for i in range(n)]
        self.rank = [0 for _ in range(n)]

    def find(self, x: int) -> int:
        """Return the root of the set containing ``x``."""
        links = self.parent
        node = x
        while True:
            above = links[node]
            if above == node:
                return node
            # Path halving: skip one level, then continue from the grandparent.
            skip = links[above]
            links[node] = skip
            node = skip

    def union(self, a: int, b: int) -> None:
        """Merge the sets containing ``a`` and ``b``; no-op if already merged."""
        keep, absorb = self.find(a), self.find(b)
        if keep == absorb:
            return
        ranks = self.rank
        if ranks[keep] < ranks[absorb]:
            # The lower-ranked root is always attached under the higher-ranked one.
            keep, absorb = absorb, keep
        elif ranks[keep] == ranks[absorb]:
            # Equal ranks: the root of ``a`` wins and its rank grows by one.
            ranks[keep] += 1
        self.parent[absorb] = keep
301
+
302
+
303
def _safe_id_sort_key(val):
    """Return a sort key that orders integer-like IDs numerically.

    Values accepted by ``int()`` sort first, by their integer value, as
    ``(0, int(val))``. Everything else sorts after them, by its string form,
    as ``(1, str(val))``. The leading tag guarantees that an int is never
    compared against a str.
    """
    try:
        return (0, int(val))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other unsupported types; ValueError: non-numeric
        # strings and NaN; OverflowError: infinities. Any other exception is a
        # genuine bug in the value and is allowed to surface.
        return (1, str(val))
309
+
310
+
311
def _jaccard_similarity(a: set, b: set) -> float:
    """Return the Jaccard similarity (intersection size over union size) of two sets.

    Two empty sets count as identical and score 1.0. An empty set against a
    non-empty one, or two disjoint sets, score 0.0.
    """
    if not (a or b):
        return 1.0
    if not (a and b):
        return 0.0
    shared = len(a.intersection(b))
    if not shared:
        return 0.0
    # Union size by inclusion-exclusion, without materializing the union set.
    return shared / (len(a) + len(b) - shared)
321
+
322
+
323
def _filter_terms_by_jaccard_threshold(terms: pd.DataFrame, threshold: float, genes_col: str = "all_genes") -> pd.DataFrame:
    """Remove near-duplicate terms whose gene sets have Jaccard similarity >= threshold.

    Terms are linked whenever their gene sets reach the threshold, and one
    representative (the smallest ID, see ``_safe_id_sort_key``) is kept per
    connected component of that similarity graph. Because components are
    transitive, a dropped term may be similar to the representative only
    through a chain of intermediate terms.

    Candidate pairs come from a prefix filter over rarity-ordered tokens and a
    size filter; every candidate is then verified with the exact Jaccard
    similarity, so the filters affect speed, never which pairs qualify.

    Args:
        terms: Gold-standard terms. IDs are read from the ``"ID"`` column when
            present, otherwise from the index.
        threshold: Similarity cut-off. Only values strictly between 0 and 1 are
            handled here; anything else returns ``terms`` unchanged (the
            identical-set case ``1.0`` is handled by the caller).
        genes_col: Column holding an iterable of genes per term.

    Returns:
        ``terms`` itself when nothing needs filtering (out-of-range threshold
        or at most one term), otherwise a filtered copy.

    Note:
        Terms with an empty gene set have an empty prefix, so they never
        become candidates and are always kept.
    """
    from collections import Counter, defaultdict
    import math

    if not (0.0 < threshold < 1.0):
        # threshold == 1.0 handled elsewhere; invalid values rejected earlier
        return terms

    has_id_col = "ID" in terms.columns
    term_ids = terms["ID"].tolist() if has_id_col else terms.index.tolist()
    gene_sets = [set(genes) for genes in terms[genes_col].tolist()]
    n_terms = len(gene_sets)
    if n_terms <= 1:
        return terms
    sizes = [len(genes) for genes in gene_sets]

    # Global token frequency: rare tokens go first so prefixes are selective.
    freq = Counter()
    for genes in gene_sets:
        freq.update(genes)

    def sort_tokens(genes: set) -> list:
        return sorted(genes, key=lambda tok: (freq[tok], str(tok)))

    # Tolerance for the pruning filters only. ``threshold * size`` can round to
    # just above an integer (0.55 * 100 == 55.00000000000001), which would
    # shorten a prefix or reject a partner size that is exactly on the
    # boundary. Relaxing the filters is always safe because every candidate is
    # verified exactly below.
    eps = 1e-9

    # A pair can only reach the threshold if it shares at least
    # ceil(threshold * size) tokens, so two qualifying sets must share a token
    # within their first ``size - ceil(threshold * size) + 1`` ordered tokens.
    prefixes = []
    for genes, size in zip(gene_sets, sizes):
        min_overlap = math.ceil(threshold * size - eps)
        prefix_len = min(max(size - min_overlap + 1, 0), size)
        prefixes.append(sort_tokens(genes)[:prefix_len])

    # Process smaller sets first, so every indexed set is no larger than the
    # one probing the index and only a lower size bound is needed.
    order = sorted(range(n_terms), key=lambda i: (sizes[i], _safe_id_sort_key(term_ids[i])))

    inv_index = defaultdict(list)  # prefix token -> positions of already processed terms
    uf = _UnionFind(n_terms)

    for i in order:
        min_partner_size = threshold * sizes[i] - eps
        candidates = set()
        for tok in prefixes[i]:
            posting = inv_index[tok]
            for j in posting:
                # Size filter: a much smaller set cannot reach the threshold.
                if sizes[j] >= min_partner_size:
                    candidates.add(j)
            posting.append(i)

        set_i = gene_sets[i]
        for j in candidates:
            # Exact verification
            if _jaccard_similarity(set_i, gene_sets[j]) >= threshold:
                uf.union(i, j)

    # Group term positions by connected component.
    components = defaultdict(list)
    for i in range(n_terms):
        components[uf.find(i)].append(i)

    # Keep the smallest ID of each component.
    keep_ids = {
        term_ids[min(members, key=lambda k: _safe_id_sort_key(term_ids[k]))]
        for members in components.values()
    }

    selector = terms["ID"] if has_id_col else terms.index
    return terms[selector.isin(keep_ids)].copy()
420
+
421
+
258
422
 
259
423
 
260
424
 
File without changes
File without changes
File without changes
File without changes