risk-network 0.0.7b11__tar.gz → 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {risk_network-0.0.7b11 → risk_network-0.0.8}/PKG-INFO +84 -21
- risk_network-0.0.8/README.md +102 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/__init__.py +1 -1
- risk_network-0.0.8/risk/annotations/__init__.py +7 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/annotations/annotations.py +86 -54
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/annotations/io.py +14 -14
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/log/__init__.py +1 -1
- risk_network-0.0.8/risk/log/console.py +139 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/log/params.py +6 -6
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/neighborhoods/community.py +68 -61
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/neighborhoods/domains.py +43 -20
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/neighborhoods/neighborhoods.py +136 -71
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/network/geometry.py +5 -2
- risk_network-0.0.8/risk/network/graph.py +219 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/network/io.py +56 -18
- risk_network-0.0.8/risk/network/plot/__init__.py +6 -0
- risk_network-0.0.8/risk/network/plot/canvas.py +290 -0
- risk_network-0.0.8/risk/network/plot/contour.py +327 -0
- risk_network-0.0.8/risk/network/plot/labels.py +929 -0
- risk_network-0.0.8/risk/network/plot/network.py +288 -0
- risk_network-0.0.8/risk/network/plot/plotter.py +137 -0
- risk_network-0.0.8/risk/network/plot/utils/color.py +424 -0
- risk_network-0.0.8/risk/network/plot/utils/layout.py +91 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/risk.py +84 -58
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/hypergeom.py +1 -1
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/permutation/permutation.py +21 -8
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/poisson.py +2 -2
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/stats.py +12 -10
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk_network.egg-info/PKG-INFO +84 -21
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk_network.egg-info/SOURCES.txt +9 -2
- risk_network-0.0.7b11/README.md +0 -39
- risk_network-0.0.7b11/risk/annotations/__init__.py +0 -7
- risk_network-0.0.7b11/risk/log/config.py +0 -48
- risk_network-0.0.7b11/risk/network/graph.py +0 -385
- risk_network-0.0.7b11/risk/network/plot.py +0 -1343
- {risk_network-0.0.7b11 → risk_network-0.0.8}/LICENSE +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/MANIFEST.in +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/pyproject.toml +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/constants.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/neighborhoods/__init__.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/network/__init__.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/__init__.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/permutation/__init__.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk/stats/permutation/test_functions.py +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk_network.egg-info/dependency_links.txt +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk_network.egg-info/requires.txt +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/risk_network.egg-info/top_level.txt +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/setup.cfg +0 -0
- {risk_network-0.0.7b11 → risk_network-0.0.8}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: risk-network
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.8
|
4
4
|
Summary: A Python package for biological network analysis
|
5
5
|
Author: Ira Horecka
|
6
6
|
Author-email: Ira Horecka <ira89@icloud.com>
|
@@ -709,42 +709,105 @@ Requires-Dist: statsmodels
|
|
709
709
|
Requires-Dist: threadpoolctl
|
710
710
|
Requires-Dist: tqdm
|
711
711
|
|
712
|
-
|
713
|
-
<img src="https://i.imgur.com/Fo9EmnK.png" width="400" />
|
714
|
-
</p>
|
712
|
+
# RISK Network
|
715
713
|
|
716
714
|
<p align="center">
|
717
|
-
<
|
718
|
-
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.8+-blue.svg" alt="Python 3.8+"></a>
|
719
|
-
<a href="https://raw.githubusercontent.com/irahorecka/chrono24/main/LICENSE"><img src="https://img.shields.io/badge/License-GPLv3-blue.svg" alt="License: GPL v3"></a>
|
715
|
+
<img src="https://i.imgur.com/8TleEJs.png" width="50%" />
|
720
716
|
</p>
|
721
717
|
|
722
|
-
|
718
|
+
<br>
|
719
|
+
|
720
|
+

|
721
|
+
[PyPI](https://pypi.python.org/pypi/risk-network)
|
722
|
+

|
723
|
+
[DOI](https://doi.org/10.5281/zenodo.xxxxxxx)
|
724
|
+

|
725
|
+

|
726
|
+
|
727
|
+
**RISK (RISK Infers Spatial Kinships)** is a next-generation tool designed to streamline the analysis of biological and non-biological networks. RISK enhances network analysis with its modular architecture, extensive file format support, and advanced clustering algorithms. It simplifies the creation of publication-quality figures, making it an important tool for researchers across disciplines.
|
723
728
|
|
724
|
-
|
729
|
+
## Documentation and Tutorial
|
730
|
+
|
731
|
+
- **Documentation**: Comprehensive documentation is available [here](Documentation link).
|
732
|
+
- **Tutorial**: An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial).
|
733
|
+
We highly recommend that new users consult the documentation and tutorial early on to fully leverage RISK's capabilities.
|
734
|
+
|
735
|
+
## Installation
|
725
736
|
|
726
|
-
RISK is
|
737
|
+
RISK is compatible with Python 3.8 and later versions and operates on all major operating systems. Install RISK via pip:
|
738
|
+
|
739
|
+
```bash
|
740
|
+
pip install risk-network
|
741
|
+
```
|
727
742
|
|
728
743
|
## Features
|
729
744
|
|
730
|
-
-
|
731
|
-
-
|
732
|
-
-
|
745
|
+
- **Comprehensive Network Analysis**: Analyze biological networks such as protein–protein interaction (PPI) and gene regulatory networks, as well as non-biological networks.
|
746
|
+
- **Advanced Clustering Algorithms**: Utilize algorithms like Louvain, Markov Clustering, Spinglass, and more to identify key functional modules.
|
747
|
+
- **Flexible Visualization**: Generate clear, publication-quality figures with customizable node and edge attributes, including colors, shapes, sizes, and labels.
|
748
|
+
- **Efficient Data Handling**: Optimized for large datasets, supporting multiple file formats such as JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
|
749
|
+
- **Statistical Analysis**: Integrated statistical tests, including hypergeometric, permutation, and Poisson tests, to assess the significance of enriched regions.
|
750
|
+
- **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
|
733
751
|
|
734
|
-
## Example
|
752
|
+
## Example Usage
|
735
753
|
|
736
|
-
*Saccharomyces cerevisiae*
|
754
|
+
We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network, revealing both established and novel functional relationships. The visualization below highlights key biological processes such as ribosomal assembly and mitochondrial organization.
|
737
755
|
|
738
|
-

|
739
757
|
|
740
|
-
|
758
|
+
RISK successfully detected both known and novel functional clusters within the yeast interactome. Clusters related to Golgi transport and actin nucleation were clearly defined and closely located, showcasing RISK's ability to map well-characterized interactions. Additionally, RISK identified links between mRNA processing pathways and vesicle trafficking proteins, consistent with recent studies demonstrating the role of vesicles in mRNA localization and stability.
|
759
|
+
|
760
|
+
## Citation
|
761
|
+
|
762
|
+
If you use RISK in your research, please cite the following:
|
763
|
+
|
764
|
+
**Horecka**, *et al.*, "RISK: a next-generation tool for biological network annotation and visualization", **[Journal Name]**, 2024. DOI: [10.5281/zenodo.xxxxxxx](https://doi.org/10.5281/zenodo.xxxxxxx)
|
765
|
+
|
766
|
+
## Software Architecture and Implementation
|
741
767
|
|
742
|
-
|
768
|
+
RISK features a streamlined, modular architecture designed to meet diverse research needs. Each module focuses on a specific task—such as network input/output, statistical analysis, or visualization—ensuring ease of adaptation and extension. This design enhances flexibility and reduces development overhead for users integrating RISK into their workflows.
|
743
769
|
|
744
|
-
|
770
|
+
### Supported Data Formats
|
745
771
|
|
746
|
-
|
772
|
+
- **Input/Output**: JSON, CSV, TSV, Excel, Cytoscape, GPickle.
|
773
|
+
- **Visualization Outputs**: SVG, PNG, PDF.
|
774
|
+
|
775
|
+
### Clustering Algorithms
|
776
|
+
|
777
|
+
- **Available Algorithms**:
|
778
|
+
- Greedy Modularity
|
779
|
+
- Label Propagation
|
780
|
+
- Louvain
|
781
|
+
- Markov Clustering
|
782
|
+
- Spinglass
|
783
|
+
- Walktrap
|
784
|
+
- **Distance Metrics**: Supports both spherical and Euclidean distance metrics.
|
785
|
+
|
786
|
+
### Statistical Tests
|
787
|
+
|
788
|
+
- **Hypergeometric Test**
|
789
|
+
- **Permutation Test** (single- or multi-process modes)
|
790
|
+
- **Poisson Test**
|
791
|
+
|
792
|
+
## Performance and Efficiency
|
793
|
+
|
794
|
+
In benchmarking tests using the yeast interactome network, RISK demonstrated substantial improvements over previous tools in both computational performance and memory efficiency. RISK processed the dataset approximately **3.25 times faster**, reducing CPU time by **69%**, and required **25% less peak memory**, underscoring its efficient use of computational resources.
|
795
|
+
|
796
|
+
## Contributing
|
797
|
+
|
798
|
+
We welcome contributions from the community. Please use the following resources:
|
799
|
+
|
800
|
+
- [Issues Tracker](https://github.com/irahorecka/risk/issues)
|
801
|
+
- [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
|
802
|
+
|
803
|
+
## Support
|
804
|
+
|
805
|
+
If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/irahorecka/risk/issues) on GitHub.
|
747
806
|
|
748
807
|
## License
|
749
808
|
|
750
|
-
|
809
|
+
RISK is freely available as open-source software under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
810
|
+
|
811
|
+
---
|
812
|
+
|
813
|
+
**Note**: For detailed documentation and to access the interactive tutorial, please visit the links provided in the [Documentation and Tutorial](#documentation-and-tutorial) section.
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# RISK Network
|
2
|
+
|
3
|
+
<p align="center">
|
4
|
+
<img src="https://i.imgur.com/8TleEJs.png" width="50%" />
|
5
|
+
</p>
|
6
|
+
|
7
|
+
<br>
|
8
|
+
|
9
|
+

|
10
|
+
[PyPI](https://pypi.python.org/pypi/risk-network)
|
11
|
+

|
12
|
+
[DOI](https://doi.org/10.5281/zenodo.xxxxxxx)
|
13
|
+

|
14
|
+

|
15
|
+
|
16
|
+
**RISK (RISK Infers Spatial Kinships)** is a next-generation tool designed to streamline the analysis of biological and non-biological networks. RISK enhances network analysis with its modular architecture, extensive file format support, and advanced clustering algorithms. It simplifies the creation of publication-quality figures, making it an important tool for researchers across disciplines.
|
17
|
+
|
18
|
+
## Documentation and Tutorial
|
19
|
+
|
20
|
+
- **Documentation**: Comprehensive documentation is available [here](Documentation link).
|
21
|
+
- **Tutorial**: An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial).
|
22
|
+
We highly recommend that new users consult the documentation and tutorial early on to fully leverage RISK's capabilities.
|
23
|
+
|
24
|
+
## Installation
|
25
|
+
|
26
|
+
RISK is compatible with Python 3.8 and later versions and operates on all major operating systems. Install RISK via pip:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
pip install risk-network
|
30
|
+
```
|
31
|
+
|
32
|
+
## Features
|
33
|
+
|
34
|
+
- **Comprehensive Network Analysis**: Analyze biological networks such as protein–protein interaction (PPI) and gene regulatory networks, as well as non-biological networks.
|
35
|
+
- **Advanced Clustering Algorithms**: Utilize algorithms like Louvain, Markov Clustering, Spinglass, and more to identify key functional modules.
|
36
|
+
- **Flexible Visualization**: Generate clear, publication-quality figures with customizable node and edge attributes, including colors, shapes, sizes, and labels.
|
37
|
+
- **Efficient Data Handling**: Optimized for large datasets, supporting multiple file formats such as JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
|
38
|
+
- **Statistical Analysis**: Integrated statistical tests, including hypergeometric, permutation, and Poisson tests, to assess the significance of enriched regions.
|
39
|
+
- **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
|
40
|
+
|
41
|
+
## Example Usage
|
42
|
+
|
43
|
+
We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network, revealing both established and novel functional relationships. The visualization below highlights key biological processes such as ribosomal assembly and mitochondrial organization.
|
44
|
+
|
45
|
+

|
46
|
+
|
47
|
+
RISK successfully detected both known and novel functional clusters within the yeast interactome. Clusters related to Golgi transport and actin nucleation were clearly defined and closely located, showcasing RISK's ability to map well-characterized interactions. Additionally, RISK identified links between mRNA processing pathways and vesicle trafficking proteins, consistent with recent studies demonstrating the role of vesicles in mRNA localization and stability.
|
48
|
+
|
49
|
+
## Citation
|
50
|
+
|
51
|
+
If you use RISK in your research, please cite the following:
|
52
|
+
|
53
|
+
**Horecka**, *et al.*, "RISK: a next-generation tool for biological network annotation and visualization", **[Journal Name]**, 2024. DOI: [10.5281/zenodo.xxxxxxx](https://doi.org/10.5281/zenodo.xxxxxxx)
|
54
|
+
|
55
|
+
## Software Architecture and Implementation
|
56
|
+
|
57
|
+
RISK features a streamlined, modular architecture designed to meet diverse research needs. Each module focuses on a specific task—such as network input/output, statistical analysis, or visualization—ensuring ease of adaptation and extension. This design enhances flexibility and reduces development overhead for users integrating RISK into their workflows.
|
58
|
+
|
59
|
+
### Supported Data Formats
|
60
|
+
|
61
|
+
- **Input/Output**: JSON, CSV, TSV, Excel, Cytoscape, GPickle.
|
62
|
+
- **Visualization Outputs**: SVG, PNG, PDF.
|
63
|
+
|
64
|
+
### Clustering Algorithms
|
65
|
+
|
66
|
+
- **Available Algorithms**:
|
67
|
+
- Greedy Modularity
|
68
|
+
- Label Propagation
|
69
|
+
- Louvain
|
70
|
+
- Markov Clustering
|
71
|
+
- Spinglass
|
72
|
+
- Walktrap
|
73
|
+
- **Distance Metrics**: Supports both spherical and Euclidean distance metrics.
|
74
|
+
|
75
|
+
### Statistical Tests
|
76
|
+
|
77
|
+
- **Hypergeometric Test**
|
78
|
+
- **Permutation Test** (single- or multi-process modes)
|
79
|
+
- **Poisson Test**
|
80
|
+
|
81
|
+
## Performance and Efficiency
|
82
|
+
|
83
|
+
In benchmarking tests using the yeast interactome network, RISK demonstrated substantial improvements over previous tools in both computational performance and memory efficiency. RISK processed the dataset approximately **3.25 times faster**, reducing CPU time by **69%**, and required **25% less peak memory**, underscoring its efficient use of computational resources.
|
84
|
+
|
85
|
+
## Contributing
|
86
|
+
|
87
|
+
We welcome contributions from the community. Please use the following resources:
|
88
|
+
|
89
|
+
- [Issues Tracker](https://github.com/irahorecka/risk/issues)
|
90
|
+
- [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
|
91
|
+
|
92
|
+
## Support
|
93
|
+
|
94
|
+
If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/irahorecka/risk/issues) on GitHub.
|
95
|
+
|
96
|
+
## License
|
97
|
+
|
98
|
+
RISK is freely available as open-source software under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
99
|
+
|
100
|
+
---
|
101
|
+
|
102
|
+
**Note**: For detailed documentation and to access the interactive tutorial, please visit the links provided in the [Documentation and Tutorial](#documentation-and-tutorial) section.
|
@@ -3,8 +3,9 @@ risk/annotations/annotations
|
|
3
3
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
4
4
|
"""
|
5
5
|
|
6
|
+
import re
|
6
7
|
from collections import Counter
|
7
|
-
from itertools import compress
|
8
|
+
from itertools import compress
|
8
9
|
from typing import Any, Dict, List, Set
|
9
10
|
|
10
11
|
import networkx as nx
|
@@ -30,27 +31,30 @@ def _setup_nltk():
|
|
30
31
|
|
31
32
|
# Ensure you have the necessary NLTK data
|
32
33
|
_setup_nltk()
|
34
|
+
# Initialize English stopwords
|
35
|
+
stop_words = set(stopwords.words("english"))
|
33
36
|
|
34
37
|
|
35
38
|
def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Dict[str, Any]:
|
36
39
|
"""Convert annotations input to a DataFrame and reindex based on the network's node labels.
|
37
40
|
|
38
41
|
Args:
|
39
|
-
|
42
|
+
network (nx.Graph): The network graph.
|
43
|
+
annotations_input (Dict[str, Any]): A dictionary with annotations.
|
40
44
|
|
41
45
|
Returns:
|
42
|
-
|
46
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the binary annotations matrix.
|
43
47
|
"""
|
44
48
|
# Flatten the dictionary to a list of tuples for easier DataFrame creation
|
45
49
|
flattened_annotations = [
|
46
50
|
(node, annotation) for annotation, nodes in annotations_input.items() for node in nodes
|
47
51
|
]
|
48
52
|
# Create a DataFrame from the flattened list
|
49
|
-
annotations = pd.DataFrame(flattened_annotations, columns=["
|
50
|
-
annotations["
|
53
|
+
annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
|
54
|
+
annotations["is_member"] = 1
|
51
55
|
# Pivot to create a binary matrix with nodes as rows and annotations as columns
|
52
56
|
annotations_pivot = annotations.pivot_table(
|
53
|
-
index="
|
57
|
+
index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
|
54
58
|
)
|
55
59
|
# Reindex the annotations matrix based on the node labels from the network
|
56
60
|
node_label_order = list(nx.get_node_attributes(network, "label").values())
|
@@ -80,7 +84,8 @@ def define_top_annotations(
|
|
80
84
|
network: nx.Graph,
|
81
85
|
ordered_annotation_labels: List[str],
|
82
86
|
neighborhood_enrichment_sums: List[int],
|
83
|
-
|
87
|
+
significant_enrichment_matrix: np.ndarray,
|
88
|
+
significant_binary_enrichment_matrix: np.ndarray,
|
84
89
|
min_cluster_size: int = 5,
|
85
90
|
max_cluster_size: int = 1000,
|
86
91
|
) -> pd.DataFrame:
|
@@ -90,42 +95,52 @@ def define_top_annotations(
|
|
90
95
|
network (NetworkX graph): The network graph.
|
91
96
|
ordered_annotation_labels (list of str): List of ordered annotation labels.
|
92
97
|
neighborhood_enrichment_sums (list of int): List of neighborhood enrichment sums.
|
93
|
-
|
98
|
+
significant_enrichment_matrix (np.ndarray): Enrichment matrix below alpha threshold.
|
99
|
+
significant_binary_enrichment_matrix (np.ndarray): Binary enrichment matrix below alpha threshold.
|
94
100
|
min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
|
95
101
|
max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
|
96
102
|
|
97
103
|
Returns:
|
98
104
|
pd.DataFrame: DataFrame with top annotations and their properties.
|
99
105
|
"""
|
100
|
-
#
|
106
|
+
# Sum the columns of the significant enrichment matrix (positive floating point values)
|
107
|
+
significant_enrichment_scores = significant_enrichment_matrix.sum(axis=0)
|
108
|
+
# Create DataFrame to store annotations, their neighborhood enrichment sums, and enrichment scores
|
101
109
|
annotations_enrichment_matrix = pd.DataFrame(
|
102
110
|
{
|
103
111
|
"id": range(len(ordered_annotation_labels)),
|
104
|
-
"
|
105
|
-
"
|
112
|
+
"full_terms": ordered_annotation_labels,
|
113
|
+
"significant_neighborhood_enrichment_sums": neighborhood_enrichment_sums,
|
114
|
+
"significant_enrichment_score": significant_enrichment_scores,
|
106
115
|
}
|
107
116
|
)
|
108
|
-
annotations_enrichment_matrix["
|
109
|
-
# Apply size constraints to identify potential
|
117
|
+
annotations_enrichment_matrix["significant_annotations"] = False
|
118
|
+
# Apply size constraints to identify potential significant annotations
|
110
119
|
annotations_enrichment_matrix.loc[
|
111
|
-
(
|
112
|
-
|
113
|
-
|
120
|
+
(
|
121
|
+
annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
|
122
|
+
>= min_cluster_size
|
123
|
+
)
|
124
|
+
& (
|
125
|
+
annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
|
126
|
+
<= max_cluster_size
|
127
|
+
),
|
128
|
+
"significant_annotations",
|
114
129
|
] = True
|
115
130
|
# Initialize columns for connected components analysis
|
116
|
-
annotations_enrichment_matrix["
|
117
|
-
annotations_enrichment_matrix["
|
118
|
-
annotations_enrichment_matrix["
|
119
|
-
"
|
131
|
+
annotations_enrichment_matrix["num_connected_components"] = 0
|
132
|
+
annotations_enrichment_matrix["size_connected_components"] = None
|
133
|
+
annotations_enrichment_matrix["size_connected_components"] = annotations_enrichment_matrix[
|
134
|
+
"size_connected_components"
|
120
135
|
].astype(object)
|
121
|
-
annotations_enrichment_matrix["
|
136
|
+
annotations_enrichment_matrix["num_large_connected_components"] = 0
|
122
137
|
|
123
138
|
for attribute in annotations_enrichment_matrix.index.values[
|
124
|
-
annotations_enrichment_matrix["
|
139
|
+
annotations_enrichment_matrix["significant_annotations"]
|
125
140
|
]:
|
126
141
|
# Identify enriched neighborhoods based on the binary enrichment matrix
|
127
142
|
enriched_neighborhoods = list(
|
128
|
-
compress(list(network),
|
143
|
+
compress(list(network), significant_binary_enrichment_matrix[:, attribute])
|
129
144
|
)
|
130
145
|
enriched_network = nx.subgraph(network, enriched_neighborhoods)
|
131
146
|
# Analyze connected components within the enriched subnetwork
|
@@ -144,57 +159,74 @@ def define_top_annotations(
|
|
144
159
|
num_large_connected_components = len(filtered_size_connected_components)
|
145
160
|
|
146
161
|
# Assign the number of connected components
|
147
|
-
annotations_enrichment_matrix.loc[attribute, "
|
162
|
+
annotations_enrichment_matrix.loc[attribute, "num_connected_components"] = (
|
148
163
|
num_connected_components
|
149
164
|
)
|
150
165
|
# Filter out attributes with more than one connected component
|
151
166
|
annotations_enrichment_matrix.loc[
|
152
|
-
annotations_enrichment_matrix["
|
167
|
+
annotations_enrichment_matrix["num_connected_components"] > 1, "significant_annotations"
|
153
168
|
] = False
|
154
169
|
# Assign the number of large connected components
|
155
|
-
annotations_enrichment_matrix.loc[attribute, "
|
170
|
+
annotations_enrichment_matrix.loc[attribute, "num_large_connected_components"] = (
|
156
171
|
num_large_connected_components
|
157
172
|
)
|
158
173
|
# Assign the size of connected components, ensuring it is always a list
|
159
|
-
annotations_enrichment_matrix.at[attribute, "
|
174
|
+
annotations_enrichment_matrix.at[attribute, "size_connected_components"] = (
|
160
175
|
filtered_size_connected_components.tolist()
|
161
176
|
)
|
162
177
|
|
163
178
|
return annotations_enrichment_matrix
|
164
179
|
|
165
180
|
|
166
|
-
def
|
167
|
-
"""
|
168
|
-
|
181
|
+
def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
|
182
|
+
"""Generate a weighted description from words and their corresponding scores,
|
183
|
+
with support for stopwords filtering and improved weighting logic.
|
169
184
|
|
170
185
|
Args:
|
171
186
|
words_column (pd.Series): A pandas Series containing strings to process.
|
187
|
+
scores_column (pd.Series): A pandas Series containing enrichment scores to weigh the terms.
|
172
188
|
|
173
189
|
Returns:
|
174
|
-
str: A coherent description formed from the most frequent and significant words.
|
190
|
+
str: A coherent description formed from the most frequent and significant words, weighed by enrichment scores.
|
175
191
|
"""
|
176
|
-
#
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
192
|
+
# Handle case where all scores are the same
|
193
|
+
if scores_column.max() == scores_column.min():
|
194
|
+
normalized_scores = pd.Series([1] * len(scores_column))
|
195
|
+
else:
|
196
|
+
# Normalize the enrichment scores to be between 0 and 1
|
197
|
+
normalized_scores = (scores_column - scores_column.min()) / (
|
198
|
+
scores_column.max() - scores_column.min()
|
199
|
+
)
|
200
|
+
|
201
|
+
# Combine words and normalized scores to create weighted words
|
202
|
+
weighted_words = []
|
203
|
+
for word, score in zip(words_column, normalized_scores):
|
204
|
+
word = str(word)
|
205
|
+
if word not in stop_words: # Skip stopwords
|
206
|
+
weight = max(1, int((0 if pd.isna(score) else score) * 10))
|
207
|
+
weighted_words.extend([word] * weight)
|
208
|
+
|
209
|
+
# Tokenize the weighted words, but preserve number-word patterns like '4-alpha'
|
210
|
+
tokens = word_tokenize(" ".join(weighted_words))
|
211
|
+
# Ensure we treat "4-alpha" or other "number-word" patterns as single tokens
|
212
|
+
combined_tokens = []
|
213
|
+
for token in tokens:
|
214
|
+
# Match patterns like '4-alpha' or '5-hydroxy' and keep them together
|
215
|
+
if re.match(r"^\d+-\w+", token):
|
216
|
+
combined_tokens.append(token)
|
217
|
+
elif token.replace(".", "", 1).isdigit(): # Handle pure numeric tokens
|
218
|
+
# Ignore pure numbers as descriptions unless necessary
|
219
|
+
continue
|
220
|
+
else:
|
221
|
+
combined_tokens.append(token)
|
222
|
+
|
223
|
+
# Prevent descriptions like just '4' from being selected
|
224
|
+
if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
|
225
|
+
return "N/A" # Return "N/A" for cases where it's just a number
|
226
|
+
|
227
|
+
# Simplify the word list and generate the description
|
228
|
+
simplified_words = _simplify_word_list(combined_tokens)
|
229
|
+
description = _generate_coherent_description(simplified_words)
|
198
230
|
|
199
231
|
return description
|
200
232
|
|
@@ -257,7 +289,7 @@ def _generate_coherent_description(words: List[str]) -> str:
|
|
257
289
|
If there is only one unique entry, return it directly.
|
258
290
|
|
259
291
|
Args:
|
260
|
-
words (
|
292
|
+
words (List): A list of words or numerical string values.
|
261
293
|
|
262
294
|
Returns:
|
263
295
|
str: A coherent description formed by arranging the words in a logical sequence.
|
@@ -25,15 +25,15 @@ class AnnotationsIO:
|
|
25
25
|
def __init__(self):
|
26
26
|
pass
|
27
27
|
|
28
|
-
def load_json_annotation(self,
|
28
|
+
def load_json_annotation(self, network: nx.Graph, filepath: str) -> Dict[str, Any]:
|
29
29
|
"""Load annotations from a JSON file and convert them to a DataFrame.
|
30
30
|
|
31
31
|
Args:
|
32
|
-
filepath (str): Path to the JSON annotations file.
|
33
32
|
network (NetworkX graph): The network to which the annotations are related.
|
33
|
+
filepath (str): Path to the JSON annotations file.
|
34
34
|
|
35
35
|
Returns:
|
36
|
-
|
36
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
|
37
37
|
"""
|
38
38
|
filetype = "JSON"
|
39
39
|
# Log the loading of the JSON file
|
@@ -49,8 +49,8 @@ class AnnotationsIO:
|
|
49
49
|
|
50
50
|
def load_excel_annotation(
|
51
51
|
self,
|
52
|
-
filepath: str,
|
53
52
|
network: nx.Graph,
|
53
|
+
filepath: str,
|
54
54
|
label_colname: str = "label",
|
55
55
|
nodes_colname: str = "nodes",
|
56
56
|
sheet_name: str = "Sheet1",
|
@@ -59,8 +59,8 @@ class AnnotationsIO:
|
|
59
59
|
"""Load annotations from an Excel file and associate them with the network.
|
60
60
|
|
61
61
|
Args:
|
62
|
-
filepath (str): Path to the Excel annotations file.
|
63
62
|
network (nx.Graph): The NetworkX graph to which the annotations are related.
|
63
|
+
filepath (str): Path to the Excel annotations file.
|
64
64
|
label_colname (str): Name of the column containing the labels (e.g., GO terms).
|
65
65
|
nodes_colname (str): Name of the column containing the nodes associated with each label.
|
66
66
|
sheet_name (str, optional): The name of the Excel sheet to load (default is 'Sheet1').
|
@@ -87,8 +87,8 @@ class AnnotationsIO:
|
|
87
87
|
|
88
88
|
def load_csv_annotation(
|
89
89
|
self,
|
90
|
-
filepath: str,
|
91
90
|
network: nx.Graph,
|
91
|
+
filepath: str,
|
92
92
|
label_colname: str = "label",
|
93
93
|
nodes_colname: str = "nodes",
|
94
94
|
nodes_delimiter: str = ";",
|
@@ -96,8 +96,8 @@ class AnnotationsIO:
|
|
96
96
|
"""Load annotations from a CSV file and associate them with the network.
|
97
97
|
|
98
98
|
Args:
|
99
|
-
filepath (str): Path to the CSV annotations file.
|
100
99
|
network (nx.Graph): The NetworkX graph to which the annotations are related.
|
100
|
+
filepath (str): Path to the CSV annotations file.
|
101
101
|
label_colname (str): Name of the column containing the labels (e.g., GO terms).
|
102
102
|
nodes_colname (str): Name of the column containing the nodes associated with each label.
|
103
103
|
nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
|
@@ -121,8 +121,8 @@ class AnnotationsIO:
|
|
121
121
|
|
122
122
|
def load_tsv_annotation(
|
123
123
|
self,
|
124
|
-
filepath: str,
|
125
124
|
network: nx.Graph,
|
125
|
+
filepath: str,
|
126
126
|
label_colname: str = "label",
|
127
127
|
nodes_colname: str = "nodes",
|
128
128
|
nodes_delimiter: str = ";",
|
@@ -130,8 +130,8 @@ class AnnotationsIO:
|
|
130
130
|
"""Load annotations from a TSV file and associate them with the network.
|
131
131
|
|
132
132
|
Args:
|
133
|
-
filepath (str): Path to the TSV annotations file.
|
134
133
|
network (nx.Graph): The NetworkX graph to which the annotations are related.
|
134
|
+
filepath (str): Path to the TSV annotations file.
|
135
135
|
label_colname (str): Name of the column containing the labels (e.g., GO terms).
|
136
136
|
nodes_colname (str): Name of the column containing the nodes associated with each label.
|
137
137
|
nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
|
@@ -153,15 +153,15 @@ class AnnotationsIO:
|
|
153
153
|
# Load the annotations into the provided network
|
154
154
|
return load_annotations(network, annotations_input)
|
155
155
|
|
156
|
-
def load_dict_annotation(self, content: Dict[str, Any]
|
156
|
+
def load_dict_annotation(self, network: nx.Graph, content: Dict[str, Any]) -> Dict[str, Any]:
|
157
157
|
"""Load annotations from a provided dictionary and convert them to a dictionary annotation.
|
158
158
|
|
159
159
|
Args:
|
160
|
-
content (dict): The annotations dictionary to load.
|
161
160
|
network (NetworkX graph): The network to which the annotations are related.
|
161
|
+
content (Dict[str, Any]): The annotations dictionary to load.
|
162
162
|
|
163
163
|
Returns:
|
164
|
-
|
164
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
|
165
165
|
"""
|
166
166
|
# Ensure the input content is a dictionary
|
167
167
|
if not isinstance(content, dict):
|
@@ -219,6 +219,6 @@ def _log_loading(filetype: str, filepath: str = "") -> None:
|
|
219
219
|
filepath (str, optional): The path to the file being loaded.
|
220
220
|
"""
|
221
221
|
log_header("Loading annotations")
|
222
|
-
logger.
|
222
|
+
logger.debug(f"Filetype: {filetype}")
|
223
223
|
if filepath:
|
224
|
-
logger.
|
224
|
+
logger.debug(f"Filepath: {filepath}")
|