cfi-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,672 @@
1
+ import copy
2
+ import os
3
+ import pickle
4
+ import sys
5
+
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
+
9
# Importing gedspy prints to stdout; silence it for the duration of the import.
# The try/finally guarantees that the real stdout is restored and the devnull
# handle is closed even when the import itself raises.
_old_stdout = sys.stdout
sys.stdout = open(os.devnull, "w")
try:
    from gedspy import Analysis, Enrichment
finally:
    sys.stdout.close()
    sys.stdout = _old_stdout
16
+
17
+
18
class CellFunCon:
    """
    Cell-type functional analysis and enrichment on top of a JDtI-COMPsc object.

    The class computes marker genes per cell type, runs functional enrichment
    (GO, KEGG, REACTOME, STRING, IntAct, specificity) through ``gedspy`` and
    derives cell-cell interaction tables. Projects can be saved and loaded
    via pickle.

    Attributes
    ----------
    jdti : object
        JDtI-COMPsc object; its ``normalized_data`` DataFrame is read here.

    cells_markers : pd.DataFrame or None
        Marker table per cell type, set by `calculate_cells_markers`.

    enr_full_info : Enrichment
        Enrichment object with all expressed (and not filtered out) genes selected.

    cells_enrichment : dict or None
        Enrichment results keyed by cell type, set by `enrich_cells_fucntionality`.

    cells_connection : pd.DataFrame or None
        Cell-cell interaction table, set by `calculate_cell_connections`.

    mt_genes : bool
        Whether mitochondrial genes are included (default False).

    ribo_genes : bool
        Whether ribosomal genes are included (default False).
    """

    # Accepted `data_type` values of `get_enrichment_data`, mapped to the
    # substring that marks parent-level statistic columns. The "None" entry
    # never matches a lower-cased column name, so for 'specificity' no column
    # is ever excluded.
    _PARENT_COLUMNS = {
        "GO-TERM": "parent",
        "KEGG": "2nd",
        "REACTOME": "top_level",
        "specificity": "None",
    }

    def __init__(self, jdti_object, mt_genes=False, ribo_genes=False):
        """
        Initializes the CellFunCon object with a COMPsc/JDTI object.

        Parameters
        ----------
        jdti_object : object
            A COMPsc or JDTI object exposing a ``normalized_data`` DataFrame.

        mt_genes : bool
            Whether mitochondrial genes are included (default False).

        ribo_genes : bool
            Whether ribosomal genes are included (default False).
        """

        self.jdti = jdti_object
        """JDtI-COMPsc object containing normalized single-cell data."""

        self.cells_markers = None
        """DataFrame containing marker genes per cell type after calculation."""

        self.cells_connection = None
        """DataFrame storing calculated cell-cell interaction information."""

        self.cells_enrichment = None
        """Dictionary storing enrichment results per cell type."""

        self.mt_genes = mt_genes
        """Whether mitochondrial genes are included (default False)."""

        self.ribo_genes = ribo_genes
        """Whether ribosomal genes are included (default False)."""

        # Keep only features whose numeric columns sum to a positive value.
        data = self.jdti.normalized_data
        expressed = data.select_dtypes(include="number").sum(axis=1) > 0
        names = list(set(data.loc[expressed].index.tolist()))

        if self.mt_genes is False:
            names = [x for x in names if "MT-" not in x.upper()]
        if self.ribo_genes is False:
            names = [x for x in names if x[:3].upper() not in ("RPS", "RPL")]

        enr = Enrichment()
        enr.select_features(names)

        self.enr_full_info = enr
        """Enrichment object containing all genes available for enrichment analysis."""

    def _require_enrichment(self):
        """Return `cells_enrichment`, raising ValueError if it was never computed."""

        if self.cells_enrichment is None:
            raise ValueError(
                "`self.cells_enrichment` not defined. "
                "Run `enrich_cells_fucntionality()` first."
            )
        return self.cells_enrichment

    def save_project(self, filename):
        """
        Saves the current CellFunCon project as a pickle file.

        Parameters
        ----------
        filename : str
            Path of the project without extension; ``.psc`` is always appended
            (e.g. 'project_name' is written to 'project_name.psc').

        Example
        -------
        >>> self.save_project('my_project')
        """

        path = f"{filename}.psc"
        with open(path, "wb") as f:
            pickle.dump(self, f)
        # Report the path that was actually written, extension included.
        print(f"Project saved as {path}")

    @classmethod
    def load_project(cls, filename):
        """
        Loads a previously saved CellFunCon project from a pickle file.

        Parameters
        ----------
        filename : str or os.PathLike
            Path to the saved project file; must end with ``.psc``.

        Returns
        -------
        CellFunCon
            The loaded CellFunCon instance.

        Raises
        ------
        TypeError
            If the unpickled object is not an instance of this class.

        ValueError
            If `filename` does not end with ``.psc``.

        Example
        -------
        >>> self = CellFunCon.load_project('my_project.psc')
        """

        # Check the suffix, not a substring: 'x.psc.bak' is not a project file.
        if not os.fspath(filename).endswith(".psc"):
            raise ValueError("Project not belong to CellFunCon project data.")

        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load project files that come from a trusted source.
        with open(filename, "rb") as f:
            obj = pickle.load(f)

        if not isinstance(obj, cls):
            raise TypeError("Plik nie zawiera obiektu Project")
        print(f"Projekt wczytany z {filename}")
        return obj

    def calculate_cells_markers(self, min_exp=0, min_pct=0.05, n_proc=10):
        """
        Calculates marker genes for each cell type.

        Delegates to ``self.jdti.calculate_difference_markers`` (always with
        ``force=True``) and stores ``self.jdti.var_data`` afterwards.

        Parameters
        ----------
        min_exp : float, optional
            Minimum expression level to consider a gene (default 0).

        min_pct : float, optional
            Minimum fraction of cells expressing a gene (default 0.05).

        n_proc : int, optional
            Number of parallel processes to use (default 10).

        Notes
        -----
        The results are stored in the `cells_markers` attribute.
        """

        self.jdti.calculate_difference_markers(
            min_exp=min_exp, min_pct=min_pct, n_proc=n_proc, force=True
        )

        self.cells_markers = self.jdti.var_data

    def enrich_cells_fucntionality(self, p_value=0.05, log_fc=0.25, top_max=500):
        """
        Performs functional enrichment analysis for each cell type based on marker genes.

        Parameters
        ----------
        p_value : float, optional
            Maximum adjusted p-value for significant genes (default 0.05).
            If fewer than 10 markers pass, the raw ``p_val`` column is used instead.

        log_fc : float, optional
            Minimum log fold-change threshold for marker genes (default 0.25).

        top_max : int, optional
            Maximum number of top marker genes (ranked by ``esm``) per cell type
            (default 500).

        Raises
        ------
        ValueError
            If `cells_markers` is not a DataFrame.

        Notes
        -----
        This method populates `cells_enrichment` with one entry per value of the
        ``valid_group`` column, in order of first appearance.
        """

        if not isinstance(self.cells_markers, pd.DataFrame):
            raise ValueError(
                "`self.cells_markers` not defined. "
                "Run `calculate_cells_markers()` first to compute markers."
            )

        markers = self.cells_markers
        # First-appearance order keeps the result order reproducible between runs.
        cells = list(dict.fromkeys(markers["valid_group"]))

        data_dict = {}

        max_c = len(cells)
        for n, c in enumerate(cells, start=1):
            print(f"\nAnalysis {n} of {max_c} cells --> {c} \n")

            in_group = (markers["valid_group"] == c) & (markers["log(FC)"] > log_fc)
            tmp = markers[in_group & (markers["adj_pval"] <= p_value)]

            # Too few markers after multiple-testing correction: fall back
            # to the unadjusted p-value.
            if len(tmp.index) < 10:
                tmp = markers[in_group & (markers["p_val"] <= p_value)]

            tmp = tmp.sort_values("esm", ascending=False).head(top_max)

            # Shallow copy: only the `genome` attribute is rebound on the copy,
            # the shared full-genome table itself is left untouched.
            enr = copy.copy(self.enr_full_info)
            enr.genome = enr.genome[
                enr.genome["found_names"].isin(list(set(tmp["feature"])))
            ].reset_index(drop=True)
            # NOTE(review): enriche_specificiti() is called twice (first and
            # last); kept as-is because the idempotence of the gedspy call
            # cannot be confirmed from this file.
            enr.enriche_specificiti()
            enr.enriche_KEGG()
            enr.enriche_GOTERM()
            enr.enriche_REACTOME()
            enr.enriche_IntAct()
            enr.enriche_STRING()
            enr.enriche_specificiti()

            ans = Analysis(enr.get_results())
            ans.gene_interaction()
            # NOTE(review): features_specificity() is likewise called twice.
            ans.features_specificity()
            ans.REACTOME_overrepresentation()
            ans.KEGG_overrepresentation()
            ans.GO_overrepresentation()
            ans.features_specificity()

            data_dict[c] = ans.get_full_results()

        self.cells_enrichment = data_dict

    def get_enrichment_data(
        self,
        data_type="GO-TERM",
        p_value=0.05,
        test="FISH",
        adj="BH",
        parent_inc=False,
        top_n=50,
    ):
        """
        Retrieves enrichment results for all cells in a unified DataFrame.

        Parameters
        ----------
        data_type : str
            Type of enrichment to retrieve ('GO-TERM', 'KEGG', 'REACTOME', 'specificity').

        p_value : float, optional
            Maximum p-value threshold (default 0.05); every selected statistic
            column must be at or below it.

        test : str, optional
            Substring identifying the statistical test columns (default 'FISH').

        adj : str, optional
            Substring identifying the adjustment method columns (default 'BH').

        parent_inc : bool, optional
            Whether parent-level columns also take part in filtering (default False).

        top_n : int, optional
            Maximum number of terms per cell type to include (default 50).

        Returns
        -------
        pd.DataFrame
            Filtered enrichment results with a 'cell' column holding the cell type
            and a 'source' column holding `data_type`.

        Raises
        ------
        ValueError
            If `data_type` is not one of the expected values, or if enrichment
            has not been computed yet.
        """

        # Exact match: a substring test would accept e.g. "KEGG2" and fail later.
        if data_type not in self._PARENT_COLUMNS:
            raise ValueError(
                "Invalid value for 'data_type'. Expected: 'GO-TERM', 'KEGG', 'REACTOME' or 'specificity'."
            )
        parent_col = self._PARENT_COLUMNS[data_type]

        enrichment = self._require_enrichment()

        pdl = []
        for i in enrichment:
            print(i)
            stats = enrichment[i]["statistics"][data_type]
            if data_type == "specificity":
                # Specificity is a dict of tables; all but the subcellular
                # location table are stacked.
                tmp = pd.concat(
                    [
                        pd.DataFrame(table)
                        for k, table in stats.items()
                        if k != "HPA_subcellular_location"
                    ]
                )
            else:
                tmp = pd.DataFrame(stats)

            cols = sorted(
                (x for x in tmp.columns if test in x and adj in x), reverse=True
            )
            if parent_inc is False:
                cols = [x for x in cols if parent_col not in x.lower()]

            mask = (tmp[cols] <= p_value).all(axis=1)
            # Copy before adding a column so the assignment never targets a
            # view of the unfiltered table.
            tmp = tmp.loc[mask].copy()
            tmp["cell"] = i
            tmp = tmp.sort_values(by=["cell"] + cols, ascending=True)

            pdl.append(tmp.head(top_n))

        df = pd.concat(pdl)
        df["source"] = data_type
        df = df.reset_index(drop=True)

        return df

    def get_included_cells(self):
        """
        Returns the list of cell types included in the enrichment analysis.

        Each name is also printed to stdout.

        Returns
        -------
        list
            List of cell type names.

        Raises
        ------
        ValueError
            If enrichment has not been computed yet.

        Example
        -------
        >>> self.get_included_cells()
        ['CellType1', 'CellType2', ...]
        """

        cl = []
        for i in self._require_enrichment():
            print(i)
            cl.append(i)

        return cl

    def get_gene_interactions(self, cell_name):
        """
        Retrieves gene or protein interaction data for a specific cell type.

        Parameters
        ----------
        cell_name : str
            Name of the cell type.

        Returns
        -------
        pd.DataFrame
            DataFrame built from the 'interactions' statistics of the cell.

        Raises
        ------
        ValueError
            If enrichment has not been computed yet.

        KeyError
            If `cell_name` is not among the enriched cells.

        Example
        -------
        >>> self.get_gene_interactions('CellType1')
        """

        enrichment = self._require_enrichment()
        return pd.DataFrame(enrichment[cell_name]["statistics"]["interactions"])

    def calculate_cell_connections(self):
        """
        Calculates cell-cell interaction connections between all pairs of cells.

        Notes
        -----
        Populates `cells_connection` with one row per interaction shared between
        an ordered pair of distinct cells (``cell1``, ``cell2``), together with
        the involved genes/proteins (``interactor1``, ``interactor2``).

        Raises
        ------
        ValueError
            If `normalized_data` of the JDTI object is not a DataFrame.
        """

        data = self.jdti.normalized_data
        if not isinstance(data, pd.DataFrame):
            raise ValueError(
                "`self.jdti.normalized_data` not defined. "
                "Provide a JDtI object with normalized data."
            )

        # First-appearance order keeps the result order reproducible between runs.
        cells = list(dict.fromkeys(data.columns))

        data_dict = {}

        for c in tqdm(cells):
            # A boolean column mask always yields a DataFrame, also when the
            # label occurs only once (``.loc[:, c]`` would return a Series).
            tmp = data.loc[:, data.columns == c]
            names = tmp.loc[
                tmp.select_dtypes(include="number").sum(axis=1) > 0
            ].index.tolist()
            names = list(set(names))

            enr = copy.copy(self.enr_full_info)
            enr.genome = enr.genome[
                enr.genome["found_names"].isin(names)
            ].reset_index(drop=True)
            enr.enriche_CellCon()

            data_dict[c] = enr.get_results()["CellConnections"]

        # Build each per-cell table once instead of once per cell pair.
        as_interactor2 = {c: pd.DataFrame(data_dict[c]["interactor2"]) for c in cells}
        as_interactor1 = {c: pd.DataFrame(data_dict[c]["interactor1"]) for c in cells}

        full_data = []
        for c1 in tqdm(cells):
            c1_d = as_interactor2[c1]
            for c2 in cells:
                if c1 == c2:
                    continue
                c2_d = as_interactor1[c2]

                # Rows of c1 whose interaction is also present for c2.
                shared = c1_d["interaction"].isin(list(c2_d["interaction"]))
                to_ret = (
                    c1_d[shared]
                    .drop(
                        [
                            "Species",
                            "protein_id_1",
                            "protein_id_2",
                            "found_names_2",
                        ],
                        axis=1,
                    )
                    .reset_index(drop=True)
                    .rename(columns={"found_names_1": "interactor1"})
                )

                c2_subset = c2_d[["interaction", "found_names_2"]].rename(
                    columns={"found_names_2": "interactor2"}
                )

                to_ret = to_ret.merge(c2_subset, on="interaction", how="left")
                to_ret["cell1"] = c1
                to_ret["cell2"] = c2

                full_data.append(to_ret)

        self.cells_connection = pd.concat(full_data)

    def get_cell_connections(self):
        """
        Returns the calculated cell-cell interaction connections.

        Returns
        -------
        pd.DataFrame or None
            The `cells_connection` attribute; None until
            `calculate_cell_connections` has been run.

        Example
        -------
        >>> connections = self.get_cell_connections()
        """

        return self.cells_connection
520
+
521
+
522
def compare_connections(
    instances_dict: dict,
    cells_compartment: dict | None = None,
    connection_type: list = [
        "Adhesion-Adhesion",
        "Gap-Gap",
        "Ligand-Ligand",
        "Ligand-Receptor",
        "Receptor-Receptor",
        "Undefined",
    ],
):
    """
    Compare gene expression between two instances based on their cell connections.

    This function compares normalized gene expression data from the first two
    instances stored in ``instances_dict``. Optionally, the comparison can be
    restricted to specific cell compartments for each instance. Differential
    expression analysis is performed using ``jdti.calc_DEG``.

    Parameters
    ----------
    instances_dict : dict
        Dictionary containing two objects. Each object must have:

        - ``jdti.normalized_data`` : pandas.DataFrame
            Expression matrix; its index is intersected between instances and
            its columns are relabelled with the dictionary key.

        - ``cells_connection`` : pandas.DataFrame
            DataFrame containing at least the columns ``'interactor1'``,
            ``'interactor2'`` and ``'directionality'`` (only read when
            ``cells_compartment`` is not a dict).

        The dictionary keys are used as group labels in the comparison.

    cells_compartment : dict or None, optional
        Dictionary mapping each key in ``instances_dict`` to a list of column
        names to be used for the comparison. If not a dict, all columns are
        used and genes are filtered based on cell-cell connections.

    connection_type : list, optional
        ``directionality`` values of the connections whose genes are kept when
        filtering by connections; a missing (``None``) directionality counts
        as ``'Undefined'``. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        Differential expression results returned by ``calc_DEG``, filtered to
        include only rows where ``valid_group`` matches the first key in
        ``instances_dict``.

    Raises
    ------
    ValueError
        If any cell specified in ``cells_compartment`` is not present in the
        corresponding ``normalized_data`` columns.

    IndexError
        If ``instances_dict`` has fewer than two entries.

    Notes
    -----
    - Only genes common to both instances are considered.

    - When ``cells_compartment`` is not a dict, genes are further restricted to
      those appearing in the cell-cell interaction tables of either instance.

    - Only the first two entries of ``instances_dict`` are used.

    - The instances' ``cells_connection`` tables are not modified.

    - Differential expression is computed with ``min_exp=0``, ``min_pct=0``
      and ``n_proc=10``.

    See Also
    --------
    jdti.calc_DEG : Function used to compute differential expression.
    """

    import pandas as pd
    from jdti import calc_DEG

    keys_list = list(instances_dict.keys())
    first_key, second_key = keys_list[0], keys_list[1]
    first, second = instances_dict[first_key], instances_dict[second_key]

    tmp1 = _labelled_expression(first, first_key, cells_compartment)
    tmp2 = _labelled_expression(second, second_key, cells_compartment)

    common_idx = tmp1.index.intersection(tmp2.index)
    concat_df = pd.concat([tmp1.loc[common_idx], tmp2.loc[common_idx]], axis=1)

    if not isinstance(cells_compartment, dict):
        genes = _connection_genes(
            first.cells_connection, connection_type
        ) | _connection_genes(second.cells_connection, connection_type)
        # A boolean mask keeps the rows in index order, so the result no
        # longer depends on set iteration order.
        concat_df = concat_df.loc[concat_df.index.isin(list(genes))]

    results = calc_DEG(
        data=concat_df,
        metadata_list=None,
        entities="All",
        sets=None,
        min_exp=0,
        min_pct=0,
        n_proc=10,
    )

    results = results[results["valid_group"] == keys_list[0]]

    return results


def _labelled_expression(instance, label, cells_compartment):
    """Copy an instance's expression matrix, optionally subset its columns, and relabel every column with `label`."""

    expr = instance.jdti.normalized_data.copy()
    if isinstance(cells_compartment, dict):
        cells = cells_compartment[label]
        if any(cell not in expr.columns for cell in cells):
            raise ValueError(
                f'Any of {label} cells in dictionary "cells_compartment" do not occur!'
            )
        expr = expr.loc[:, cells]
    expr.columns = [label] * len(expr.columns)
    return expr


def _connection_genes(connections, connection_type):
    """Return the set of interactors of all connections whose directionality is in `connection_type`."""

    # Work on a derived label series so the caller's table is never mutated.
    labels = pd.Series(
        ["Undefined" if x is None else x for x in connections["directionality"]],
        dtype=object,
    )
    selected = connections.loc[labels.isin(connection_type).to_numpy()]
    return set(selected["interactor1"]) | set(selected["interactor2"])