risk-network 0.0.7b12__tar.gz → 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {risk_network-0.0.7b12 → risk_network-0.0.8}/PKG-INFO +84 -21
- risk_network-0.0.8/README.md +102 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/__init__.py +1 -1
- risk_network-0.0.8/risk/annotations/__init__.py +7 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/annotations/annotations.py +85 -53
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/annotations/io.py +3 -3
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/log/__init__.py +1 -1
- risk_network-0.0.7b12/risk/log/config.py → risk_network-0.0.8/risk/log/console.py +2 -2
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/log/params.py +6 -6
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/neighborhoods/community.py +68 -61
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/neighborhoods/domains.py +41 -18
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/neighborhoods/neighborhoods.py +134 -69
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/network/geometry.py +5 -2
- risk_network-0.0.8/risk/network/graph.py +219 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/network/io.py +44 -6
- risk_network-0.0.8/risk/network/plot/__init__.py +6 -0
- risk_network-0.0.8/risk/network/plot/canvas.py +290 -0
- risk_network-0.0.8/risk/network/plot/contour.py +327 -0
- risk_network-0.0.8/risk/network/plot/labels.py +929 -0
- risk_network-0.0.8/risk/network/plot/network.py +288 -0
- risk_network-0.0.8/risk/network/plot/plotter.py +137 -0
- risk_network-0.0.8/risk/network/plot/utils/color.py +424 -0
- risk_network-0.0.8/risk/network/plot/utils/layout.py +91 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/risk.py +70 -37
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/hypergeom.py +1 -1
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/permutation/permutation.py +21 -8
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/poisson.py +2 -2
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/stats.py +12 -10
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk_network.egg-info/PKG-INFO +84 -21
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk_network.egg-info/SOURCES.txt +9 -2
- risk_network-0.0.7b12/README.md +0 -39
- risk_network-0.0.7b12/risk/annotations/__init__.py +0 -7
- risk_network-0.0.7b12/risk/network/graph.py +0 -385
- risk_network-0.0.7b12/risk/network/plot.py +0 -1450
- {risk_network-0.0.7b12 → risk_network-0.0.8}/LICENSE +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/MANIFEST.in +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/pyproject.toml +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/constants.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/neighborhoods/__init__.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/network/__init__.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/__init__.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/permutation/__init__.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk/stats/permutation/test_functions.py +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk_network.egg-info/dependency_links.txt +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk_network.egg-info/requires.txt +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/risk_network.egg-info/top_level.txt +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/setup.cfg +0 -0
- {risk_network-0.0.7b12 → risk_network-0.0.8}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: risk-network
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.8
|
4
4
|
Summary: A Python package for biological network analysis
|
5
5
|
Author: Ira Horecka
|
6
6
|
Author-email: Ira Horecka <ira89@icloud.com>
|
@@ -709,42 +709,105 @@ Requires-Dist: statsmodels
|
|
709
709
|
Requires-Dist: threadpoolctl
|
710
710
|
Requires-Dist: tqdm
|
711
711
|
|
712
|
-
|
713
|
-
<img src="https://i.imgur.com/Fo9EmnK.png" width="400" />
|
714
|
-
</p>
|
712
|
+
# RISK Network
|
715
713
|
|
716
714
|
<p align="center">
|
717
|
-
<
|
718
|
-
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.8+-blue.svg" alt="Python 3.8+"></a>
|
719
|
-
<a href="https://raw.githubusercontent.com/irahorecka/chrono24/main/LICENSE"><img src="https://img.shields.io/badge/License-GPLv3-blue.svg" alt="License: GPL v3"></a>
|
715
|
+
<img src="https://i.imgur.com/8TleEJs.png" width="50%" />
|
720
716
|
</p>
|
721
717
|
|
722
|
-
|
718
|
+
<br>
|
719
|
+
|
720
|
+

|
721
|
+
[](https://pypi.python.org/pypi/risk-network)
|
722
|
+

|
723
|
+
[](https://doi.org/10.5281/zenodo.xxxxxxx)
|
724
|
+

|
725
|
+

|
726
|
+
|
727
|
+
**RISK (RISK Infers Spatial Kinships)** is a next-generation tool designed to streamline the analysis of biological and non-biological networks. RISK enhances network analysis with its modular architecture, extensive file format support, and advanced clustering algorithms. It simplifies the creation of publication-quality figures, making it an important tool for researchers across disciplines.
|
723
728
|
|
724
|
-
|
729
|
+
## Documentation and Tutorial
|
730
|
+
|
731
|
+
- **Documentation**: Comprehensive documentation is available (link to be added).
|
732
|
+
- **Tutorial**: An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial).
|
733
|
+
We highly recommend that new users consult the documentation and tutorial early on to take full advantage of RISK's capabilities.
|
734
|
+
|
735
|
+
## Installation
|
725
736
|
|
726
|
-
RISK is
|
737
|
+
RISK is compatible with Python 3.8 and later versions and operates on all major operating systems. Install RISK via pip:
|
738
|
+
|
739
|
+
```bash
|
740
|
+
pip install risk-network
|
741
|
+
```
|
727
742
|
|
728
743
|
## Features
|
729
744
|
|
730
|
-
-
|
731
|
-
-
|
732
|
-
-
|
745
|
+
- **Comprehensive Network Analysis**: Analyze biological networks such as protein–protein interaction (PPI) and gene regulatory networks, as well as non-biological networks.
|
746
|
+
- **Advanced Clustering Algorithms**: Utilize algorithms like Louvain, Markov Clustering, Spinglass, and more to identify key functional modules.
|
747
|
+
- **Flexible Visualization**: Generate clear, publication-quality figures with customizable node and edge attributes, including colors, shapes, sizes, and labels.
|
748
|
+
- **Efficient Data Handling**: Optimized for large datasets, supporting multiple file formats such as JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
|
749
|
+
- **Statistical Analysis**: Integrated statistical tests, including hypergeometric, permutation, and Poisson tests, to assess the significance of enriched regions.
|
750
|
+
- **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
|
733
751
|
|
734
|
-
## Example
|
752
|
+
## Example Usage
|
735
753
|
|
736
|
-
*Saccharomyces cerevisiae*
|
754
|
+
We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network, revealing both established and novel functional relationships. The visualization below highlights key biological processes such as ribosomal assembly and mitochondrial organization.
|
737
755
|
|
738
|
-

|
739
757
|
|
740
|
-
|
758
|
+
RISK successfully detected both known and novel functional clusters within the yeast interactome. Clusters related to Golgi transport and actin nucleation were clearly defined and closely located, showcasing RISK's ability to map well-characterized interactions. Additionally, RISK identified links between mRNA processing pathways and vesicle trafficking proteins, consistent with recent studies demonstrating the role of vesicles in mRNA localization and stability.
|
759
|
+
|
760
|
+
## Citation
|
761
|
+
|
762
|
+
If you use RISK in your research, please cite the following:
|
763
|
+
|
764
|
+
**Horecka**, *et al.*, "RISK: a next-generation tool for biological network annotation and visualization", **[Journal Name]**, 2024. DOI: [10.5281/zenodo.xxxxxxx](https://doi.org/10.5281/zenodo.xxxxxxx)
|
765
|
+
|
766
|
+
## Software Architecture and Implementation
|
741
767
|
|
742
|
-
|
768
|
+
RISK features a streamlined, modular architecture designed to meet diverse research needs. Each module focuses on a specific task—such as network input/output, statistical analysis, or visualization—ensuring ease of adaptation and extension. This design enhances flexibility and reduces development overhead for users integrating RISK into their workflows.
|
743
769
|
|
744
|
-
|
770
|
+
### Supported Data Formats
|
745
771
|
|
746
|
-
|
772
|
+
- **Input/Output**: JSON, CSV, TSV, Excel, Cytoscape, GPickle.
|
773
|
+
- **Visualization Outputs**: SVG, PNG, PDF.
|
774
|
+
|
775
|
+
### Clustering Algorithms
|
776
|
+
|
777
|
+
- **Available Algorithms**:
|
778
|
+
- Greedy Modularity
|
779
|
+
- Label Propagation
|
780
|
+
- Louvain
|
781
|
+
- Markov Clustering
|
782
|
+
- Spinglass
|
783
|
+
- Walktrap
|
784
|
+
- **Distance Metrics**: Supports both spherical and Euclidean distance metrics.
|
785
|
+
|
786
|
+
### Statistical Tests
|
787
|
+
|
788
|
+
- **Hypergeometric Test**
|
789
|
+
- **Permutation Test** (single- or multi-process modes)
|
790
|
+
- **Poisson Test**
|
791
|
+
|
792
|
+
## Performance and Efficiency
|
793
|
+
|
794
|
+
In benchmarking tests using the yeast interactome network, RISK demonstrated substantial improvements over previous tools in both computational performance and memory efficiency. RISK processed the dataset approximately **3.25 times faster**, reducing CPU time by **69%**, and required **25% less peak memory usage**, underscoring its efficient utilization of computational resources.
|
795
|
+
|
796
|
+
## Contributing
|
797
|
+
|
798
|
+
We welcome contributions from the community. Please use the following resources:
|
799
|
+
|
800
|
+
- [Issues Tracker](https://github.com/irahorecka/risk/issues)
|
801
|
+
- [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
|
802
|
+
|
803
|
+
## Support
|
804
|
+
|
805
|
+
If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/irahorecka/risk/issues) on GitHub.
|
747
806
|
|
748
807
|
## License
|
749
808
|
|
750
|
-
|
809
|
+
RISK is freely available as open-source software under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
810
|
+
|
811
|
+
---
|
812
|
+
|
813
|
+
**Note**: For detailed documentation and to access the interactive tutorial, please visit the links provided in the [Documentation and Tutorial](#documentation-and-tutorial) section.
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# RISK Network
|
2
|
+
|
3
|
+
<p align="center">
|
4
|
+
<img src="https://i.imgur.com/8TleEJs.png" width="50%" />
|
5
|
+
</p>
|
6
|
+
|
7
|
+
<br>
|
8
|
+
|
9
|
+

|
10
|
+
[](https://pypi.python.org/pypi/risk-network)
|
11
|
+

|
12
|
+
[](https://doi.org/10.5281/zenodo.xxxxxxx)
|
13
|
+

|
14
|
+

|
15
|
+
|
16
|
+
**RISK (RISK Infers Spatial Kinships)** is a next-generation tool designed to streamline the analysis of biological and non-biological networks. RISK enhances network analysis with its modular architecture, extensive file format support, and advanced clustering algorithms. It simplifies the creation of publication-quality figures, making it an important tool for researchers across disciplines.
|
17
|
+
|
18
|
+
## Documentation and Tutorial
|
19
|
+
|
20
|
+
- **Documentation**: Comprehensive documentation is available (link to be added).
|
21
|
+
- **Tutorial**: An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial).
|
22
|
+
We highly recommend that new users consult the documentation and tutorial early on to take full advantage of RISK's capabilities.
|
23
|
+
|
24
|
+
## Installation
|
25
|
+
|
26
|
+
RISK is compatible with Python 3.8 and later versions and operates on all major operating systems. Install RISK via pip:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
pip install risk-network
|
30
|
+
```
|
31
|
+
|
32
|
+
## Features
|
33
|
+
|
34
|
+
- **Comprehensive Network Analysis**: Analyze biological networks such as protein–protein interaction (PPI) and gene regulatory networks, as well as non-biological networks.
|
35
|
+
- **Advanced Clustering Algorithms**: Utilize algorithms like Louvain, Markov Clustering, Spinglass, and more to identify key functional modules.
|
36
|
+
- **Flexible Visualization**: Generate clear, publication-quality figures with customizable node and edge attributes, including colors, shapes, sizes, and labels.
|
37
|
+
- **Efficient Data Handling**: Optimized for large datasets, supporting multiple file formats such as JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
|
38
|
+
- **Statistical Analysis**: Integrated statistical tests, including hypergeometric, permutation, and Poisson tests, to assess the significance of enriched regions.
|
39
|
+
- **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
|
40
|
+
|
41
|
+
## Example Usage
|
42
|
+
|
43
|
+
We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network, revealing both established and novel functional relationships. The visualization below highlights key biological processes such as ribosomal assembly and mitochondrial organization.
|
44
|
+
|
45
|
+

|
46
|
+
|
47
|
+
RISK successfully detected both known and novel functional clusters within the yeast interactome. Clusters related to Golgi transport and actin nucleation were clearly defined and closely located, showcasing RISK's ability to map well-characterized interactions. Additionally, RISK identified links between mRNA processing pathways and vesicle trafficking proteins, consistent with recent studies demonstrating the role of vesicles in mRNA localization and stability.
|
48
|
+
|
49
|
+
## Citation
|
50
|
+
|
51
|
+
If you use RISK in your research, please cite the following:
|
52
|
+
|
53
|
+
**Horecka**, *et al.*, "RISK: a next-generation tool for biological network annotation and visualization", **[Journal Name]**, 2024. DOI: [10.5281/zenodo.xxxxxxx](https://doi.org/10.5281/zenodo.xxxxxxx)
|
54
|
+
|
55
|
+
## Software Architecture and Implementation
|
56
|
+
|
57
|
+
RISK features a streamlined, modular architecture designed to meet diverse research needs. Each module focuses on a specific task—such as network input/output, statistical analysis, or visualization—ensuring ease of adaptation and extension. This design enhances flexibility and reduces development overhead for users integrating RISK into their workflows.
|
58
|
+
|
59
|
+
### Supported Data Formats
|
60
|
+
|
61
|
+
- **Input/Output**: JSON, CSV, TSV, Excel, Cytoscape, GPickle.
|
62
|
+
- **Visualization Outputs**: SVG, PNG, PDF.
|
63
|
+
|
64
|
+
### Clustering Algorithms
|
65
|
+
|
66
|
+
- **Available Algorithms**:
|
67
|
+
- Greedy Modularity
|
68
|
+
- Label Propagation
|
69
|
+
- Louvain
|
70
|
+
- Markov Clustering
|
71
|
+
- Spinglass
|
72
|
+
- Walktrap
|
73
|
+
- **Distance Metrics**: Supports both spherical and Euclidean distance metrics.
|
74
|
+
|
75
|
+
### Statistical Tests
|
76
|
+
|
77
|
+
- **Hypergeometric Test**
|
78
|
+
- **Permutation Test** (single- or multi-process modes)
|
79
|
+
- **Poisson Test**
|
80
|
+
|
81
|
+
## Performance and Efficiency
|
82
|
+
|
83
|
+
In benchmarking tests using the yeast interactome network, RISK demonstrated substantial improvements over previous tools in both computational performance and memory efficiency. RISK processed the dataset approximately **3.25 times faster**, reducing CPU time by **69%**, and required **25% less peak memory usage**, underscoring its efficient utilization of computational resources.
|
84
|
+
|
85
|
+
## Contributing
|
86
|
+
|
87
|
+
We welcome contributions from the community. Please use the following resources:
|
88
|
+
|
89
|
+
- [Issues Tracker](https://github.com/irahorecka/risk/issues)
|
90
|
+
- [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
|
91
|
+
|
92
|
+
## Support
|
93
|
+
|
94
|
+
If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/irahorecka/risk/issues) on GitHub.
|
95
|
+
|
96
|
+
## License
|
97
|
+
|
98
|
+
RISK is freely available as open-source software under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
99
|
+
|
100
|
+
---
|
101
|
+
|
102
|
+
**Note**: For detailed documentation and to access the interactive tutorial, please visit the links provided in the [Documentation and Tutorial](#documentation-and-tutorial) section.
|
@@ -3,6 +3,7 @@ risk/annotations/annotations
|
|
3
3
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
4
4
|
"""
|
5
5
|
|
6
|
+
import re
|
6
7
|
from collections import Counter
|
7
8
|
from itertools import compress
|
8
9
|
from typing import Any, Dict, List, Set
|
@@ -30,27 +31,30 @@ def _setup_nltk():
|
|
30
31
|
|
31
32
|
# Ensure you have the necessary NLTK data
|
32
33
|
_setup_nltk()
|
34
|
+
# Initialize English stopwords
|
35
|
+
stop_words = set(stopwords.words("english"))
|
33
36
|
|
34
37
|
|
35
38
|
def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Dict[str, Any]:
|
36
39
|
"""Convert annotations input to a DataFrame and reindex based on the network's node labels.
|
37
40
|
|
38
41
|
Args:
|
39
|
-
|
42
|
+
network (nx.Graph): The network graph.
|
43
|
+
annotations_input (Dict[str, Any]): A dictionary with annotations.
|
40
44
|
|
41
45
|
Returns:
|
42
|
-
|
46
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the binary annotations matrix.
|
43
47
|
"""
|
44
48
|
# Flatten the dictionary to a list of tuples for easier DataFrame creation
|
45
49
|
flattened_annotations = [
|
46
50
|
(node, annotation) for annotation, nodes in annotations_input.items() for node in nodes
|
47
51
|
]
|
48
52
|
# Create a DataFrame from the flattened list
|
49
|
-
annotations = pd.DataFrame(flattened_annotations, columns=["
|
50
|
-
annotations["
|
53
|
+
annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
|
54
|
+
annotations["is_member"] = 1
|
51
55
|
# Pivot to create a binary matrix with nodes as rows and annotations as columns
|
52
56
|
annotations_pivot = annotations.pivot_table(
|
53
|
-
index="
|
57
|
+
index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
|
54
58
|
)
|
55
59
|
# Reindex the annotations matrix based on the node labels from the network
|
56
60
|
node_label_order = list(nx.get_node_attributes(network, "label").values())
|
@@ -80,7 +84,8 @@ def define_top_annotations(
|
|
80
84
|
network: nx.Graph,
|
81
85
|
ordered_annotation_labels: List[str],
|
82
86
|
neighborhood_enrichment_sums: List[int],
|
83
|
-
|
87
|
+
significant_enrichment_matrix: np.ndarray,
|
88
|
+
significant_binary_enrichment_matrix: np.ndarray,
|
84
89
|
min_cluster_size: int = 5,
|
85
90
|
max_cluster_size: int = 1000,
|
86
91
|
) -> pd.DataFrame:
|
@@ -90,42 +95,52 @@ def define_top_annotations(
|
|
90
95
|
network (NetworkX graph): The network graph.
|
91
96
|
ordered_annotation_labels (list of str): List of ordered annotation labels.
|
92
97
|
neighborhood_enrichment_sums (list of int): List of neighborhood enrichment sums.
|
93
|
-
|
98
|
+
significant_enrichment_matrix (np.ndarray): Enrichment matrix below alpha threshold.
|
99
|
+
significant_binary_enrichment_matrix (np.ndarray): Binary enrichment matrix below alpha threshold.
|
94
100
|
min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
|
95
101
|
max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
|
96
102
|
|
97
103
|
Returns:
|
98
104
|
pd.DataFrame: DataFrame with top annotations and their properties.
|
99
105
|
"""
|
100
|
-
#
|
106
|
+
# Sum the columns of the significant enrichment matrix (positive floating point values)
|
107
|
+
significant_enrichment_scores = significant_enrichment_matrix.sum(axis=0)
|
108
|
+
# Create DataFrame to store annotations, their neighborhood enrichment sums, and enrichment scores
|
101
109
|
annotations_enrichment_matrix = pd.DataFrame(
|
102
110
|
{
|
103
111
|
"id": range(len(ordered_annotation_labels)),
|
104
|
-
"
|
105
|
-
"
|
112
|
+
"full_terms": ordered_annotation_labels,
|
113
|
+
"significant_neighborhood_enrichment_sums": neighborhood_enrichment_sums,
|
114
|
+
"significant_enrichment_score": significant_enrichment_scores,
|
106
115
|
}
|
107
116
|
)
|
108
|
-
annotations_enrichment_matrix["
|
109
|
-
# Apply size constraints to identify potential
|
117
|
+
annotations_enrichment_matrix["significant_annotations"] = False
|
118
|
+
# Apply size constraints to identify potential significant annotations
|
110
119
|
annotations_enrichment_matrix.loc[
|
111
|
-
(
|
112
|
-
|
113
|
-
|
120
|
+
(
|
121
|
+
annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
|
122
|
+
>= min_cluster_size
|
123
|
+
)
|
124
|
+
& (
|
125
|
+
annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
|
126
|
+
<= max_cluster_size
|
127
|
+
),
|
128
|
+
"significant_annotations",
|
114
129
|
] = True
|
115
130
|
# Initialize columns for connected components analysis
|
116
|
-
annotations_enrichment_matrix["
|
117
|
-
annotations_enrichment_matrix["
|
118
|
-
annotations_enrichment_matrix["
|
119
|
-
"
|
131
|
+
annotations_enrichment_matrix["num_connected_components"] = 0
|
132
|
+
annotations_enrichment_matrix["size_connected_components"] = None
|
133
|
+
annotations_enrichment_matrix["size_connected_components"] = annotations_enrichment_matrix[
|
134
|
+
"size_connected_components"
|
120
135
|
].astype(object)
|
121
|
-
annotations_enrichment_matrix["
|
136
|
+
annotations_enrichment_matrix["num_large_connected_components"] = 0
|
122
137
|
|
123
138
|
for attribute in annotations_enrichment_matrix.index.values[
|
124
|
-
annotations_enrichment_matrix["
|
139
|
+
annotations_enrichment_matrix["significant_annotations"]
|
125
140
|
]:
|
126
141
|
# Identify enriched neighborhoods based on the binary enrichment matrix
|
127
142
|
enriched_neighborhoods = list(
|
128
|
-
compress(list(network),
|
143
|
+
compress(list(network), significant_binary_enrichment_matrix[:, attribute])
|
129
144
|
)
|
130
145
|
enriched_network = nx.subgraph(network, enriched_neighborhoods)
|
131
146
|
# Analyze connected components within the enriched subnetwork
|
@@ -144,57 +159,74 @@ def define_top_annotations(
|
|
144
159
|
num_large_connected_components = len(filtered_size_connected_components)
|
145
160
|
|
146
161
|
# Assign the number of connected components
|
147
|
-
annotations_enrichment_matrix.loc[attribute, "
|
162
|
+
annotations_enrichment_matrix.loc[attribute, "num_connected_components"] = (
|
148
163
|
num_connected_components
|
149
164
|
)
|
150
165
|
# Filter out attributes with more than one connected component
|
151
166
|
annotations_enrichment_matrix.loc[
|
152
|
-
annotations_enrichment_matrix["
|
167
|
+
annotations_enrichment_matrix["num_connected_components"] > 1, "significant_annotations"
|
153
168
|
] = False
|
154
169
|
# Assign the number of large connected components
|
155
|
-
annotations_enrichment_matrix.loc[attribute, "
|
170
|
+
annotations_enrichment_matrix.loc[attribute, "num_large_connected_components"] = (
|
156
171
|
num_large_connected_components
|
157
172
|
)
|
158
173
|
# Assign the size of connected components, ensuring it is always a list
|
159
|
-
annotations_enrichment_matrix.at[attribute, "
|
174
|
+
annotations_enrichment_matrix.at[attribute, "size_connected_components"] = (
|
160
175
|
filtered_size_connected_components.tolist()
|
161
176
|
)
|
162
177
|
|
163
178
|
return annotations_enrichment_matrix
|
164
179
|
|
165
180
|
|
166
|
-
def
|
167
|
-
"""
|
168
|
-
|
181
|
+
def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
|
182
|
+
"""Generate a weighted description from words and their corresponding scores,
|
183
|
+
with support for stopwords filtering and improved weighting logic.
|
169
184
|
|
170
185
|
Args:
|
171
186
|
words_column (pd.Series): A pandas Series containing strings to process.
|
187
|
+
scores_column (pd.Series): A pandas Series containing enrichment scores to weigh the terms.
|
172
188
|
|
173
189
|
Returns:
|
174
|
-
str: A coherent description formed from the most frequent and significant words.
|
190
|
+
str: A coherent description formed from the most frequent and significant words, weighed by enrichment scores.
|
175
191
|
"""
|
176
|
-
#
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
192
|
+
# Handle case where all scores are the same
|
193
|
+
if scores_column.max() == scores_column.min():
|
194
|
+
normalized_scores = pd.Series([1] * len(scores_column))
|
195
|
+
else:
|
196
|
+
# Normalize the enrichment scores to be between 0 and 1
|
197
|
+
normalized_scores = (scores_column - scores_column.min()) / (
|
198
|
+
scores_column.max() - scores_column.min()
|
199
|
+
)
|
200
|
+
|
201
|
+
# Combine words and normalized scores to create weighted words
|
202
|
+
weighted_words = []
|
203
|
+
for word, score in zip(words_column, normalized_scores):
|
204
|
+
word = str(word)
|
205
|
+
if word not in stop_words: # Skip stopwords
|
206
|
+
weight = max(1, int((0 if pd.isna(score) else score) * 10))
|
207
|
+
weighted_words.extend([word] * weight)
|
208
|
+
|
209
|
+
# Tokenize the weighted words, but preserve number-word patterns like '4-alpha'
|
210
|
+
tokens = word_tokenize(" ".join(weighted_words))
|
211
|
+
# Ensure we treat "4-alpha" or other "number-word" patterns as single tokens
|
212
|
+
combined_tokens = []
|
213
|
+
for token in tokens:
|
214
|
+
# Match patterns like '4-alpha' or '5-hydroxy' and keep them together
|
215
|
+
if re.match(r"^\d+-\w+", token):
|
216
|
+
combined_tokens.append(token)
|
217
|
+
elif token.replace(".", "", 1).isdigit(): # Handle pure numeric tokens
|
218
|
+
# Ignore pure numbers as descriptions unless necessary
|
219
|
+
continue
|
220
|
+
else:
|
221
|
+
combined_tokens.append(token)
|
222
|
+
|
223
|
+
# Prevent descriptions like just '4' from being selected
|
224
|
+
if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
|
225
|
+
return "N/A" # Return "N/A" for cases where it's just a number
|
226
|
+
|
227
|
+
# Simplify the word list and generate the description
|
228
|
+
simplified_words = _simplify_word_list(combined_tokens)
|
229
|
+
description = _generate_coherent_description(simplified_words)
|
198
230
|
|
199
231
|
return description
|
200
232
|
|
@@ -257,7 +289,7 @@ def _generate_coherent_description(words: List[str]) -> str:
|
|
257
289
|
If there is only one unique entry, return it directly.
|
258
290
|
|
259
291
|
Args:
|
260
|
-
words (
|
292
|
+
words (List): A list of words or numerical string values.
|
261
293
|
|
262
294
|
Returns:
|
263
295
|
str: A coherent description formed by arranging the words in a logical sequence.
|
@@ -33,7 +33,7 @@ class AnnotationsIO:
|
|
33
33
|
filepath (str): Path to the JSON annotations file.
|
34
34
|
|
35
35
|
Returns:
|
36
|
-
|
36
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
|
37
37
|
"""
|
38
38
|
filetype = "JSON"
|
39
39
|
# Log the loading of the JSON file
|
@@ -158,10 +158,10 @@ class AnnotationsIO:
|
|
158
158
|
|
159
159
|
Args:
|
160
160
|
network (NetworkX graph): The network to which the annotations are related.
|
161
|
-
content (
|
161
|
+
content (Dict[str, Any]): The annotations dictionary to load.
|
162
162
|
|
163
163
|
Returns:
|
164
|
-
|
164
|
+
Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
|
165
165
|
"""
|
166
166
|
# Ensure the input content is a dictionary
|
167
167
|
if not isinstance(content, dict):
|
@@ -12,7 +12,7 @@ from typing import Any, Dict
|
|
12
12
|
|
13
13
|
import numpy as np
|
14
14
|
|
15
|
-
from .
|
15
|
+
from .console import logger, log_header
|
16
16
|
|
17
17
|
# Suppress all warnings - this is to resolve warnings from multiprocessing
|
18
18
|
warnings.filterwarnings("ignore")
|
@@ -159,7 +159,7 @@ class Params:
|
|
159
159
|
"""Load and process various parameters, converting any np.ndarray values to lists.
|
160
160
|
|
161
161
|
Returns:
|
162
|
-
|
162
|
+
Dict[str, Any]: A dictionary containing the processed parameters.
|
163
163
|
"""
|
164
164
|
log_header("Loading parameters")
|
165
165
|
return _convert_ndarray_to_list(
|
@@ -174,14 +174,14 @@ class Params:
|
|
174
174
|
)
|
175
175
|
|
176
176
|
|
177
|
-
def _convert_ndarray_to_list(d: Any) -> Any:
|
177
|
+
def _convert_ndarray_to_list(d: Dict[str, Any]) -> Dict[str, Any]:
|
178
178
|
"""Recursively convert all np.ndarray values in the dictionary to lists.
|
179
179
|
|
180
180
|
Args:
|
181
|
-
d (
|
181
|
+
d (Dict[str, Any]): The dictionary to process.
|
182
182
|
|
183
183
|
Returns:
|
184
|
-
|
184
|
+
Dict[str, Any]: The processed dictionary with np.ndarray values converted to lists.
|
185
185
|
"""
|
186
186
|
if isinstance(d, dict):
|
187
187
|
# Recursively process each value in the dictionary
|
@@ -193,5 +193,5 @@ def _convert_ndarray_to_list(d: Any) -> Any:
|
|
193
193
|
# Convert numpy arrays to lists
|
194
194
|
return d.tolist()
|
195
195
|
else:
|
196
|
-
# Return the value unchanged if it's not a dict,
|
196
|
+
# Return the value unchanged if it's not a dict, List, or ndarray
|
197
197
|
return d
|