risk-network 0.0.8b26__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {risk_network-0.0.8b26 → risk_network-0.0.9}/PKG-INFO +29 -44
  2. risk_network-0.0.9/README.md +83 -0
  3. {risk_network-0.0.8b26 → risk_network-0.0.9}/pyproject.toml +2 -0
  4. risk_network-0.0.9/risk/__init__.py +10 -0
  5. risk_network-0.0.9/risk/annotations/__init__.py +7 -0
  6. risk_network-0.0.9/risk/annotations/annotations.py +389 -0
  7. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk/annotations/io.py +47 -31
  8. risk_network-0.0.9/risk/log/__init__.py +11 -0
  9. risk_network-0.0.8b26/risk/log/config.py → risk_network-0.0.9/risk/log/console.py +5 -3
  10. risk_network-0.0.8b26/risk/log/params.py → risk_network-0.0.9/risk/log/parameters.py +17 -42
  11. risk_network-0.0.9/risk/neighborhoods/__init__.py +8 -0
  12. risk_network-0.0.9/risk/neighborhoods/api.py +442 -0
  13. risk_network-0.0.9/risk/neighborhoods/community.py +412 -0
  14. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk/neighborhoods/domains.py +125 -52
  15. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk/neighborhoods/neighborhoods.py +177 -165
  16. risk_network-0.0.9/risk/network/__init__.py +6 -0
  17. risk_network-0.0.9/risk/network/geometry.py +150 -0
  18. risk_network-0.0.9/risk/network/graph/__init__.py +6 -0
  19. risk_network-0.0.9/risk/network/graph/api.py +200 -0
  20. {risk_network-0.0.8b26/risk/network → risk_network-0.0.9/risk/network/graph}/graph.py +90 -40
  21. risk_network-0.0.9/risk/network/graph/summary.py +254 -0
  22. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk/network/io.py +103 -114
  23. risk_network-0.0.9/risk/network/plotter/__init__.py +6 -0
  24. risk_network-0.0.9/risk/network/plotter/api.py +54 -0
  25. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/canvas.py +12 -9
  26. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/contour.py +27 -24
  27. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/labels.py +73 -78
  28. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/network.py +45 -39
  29. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/plotter.py +23 -17
  30. risk_network-0.0.8b26/risk/network/plot/utils/color.py → risk_network-0.0.9/risk/network/plotter/utils/colors.py +114 -122
  31. {risk_network-0.0.8b26/risk/network/plot → risk_network-0.0.9/risk/network/plotter}/utils/layout.py +10 -7
  32. risk_network-0.0.9/risk/risk.py +33 -0
  33. risk_network-0.0.9/risk/stats/__init__.py +15 -0
  34. risk_network-0.0.9/risk/stats/permutation/__init__.py +6 -0
  35. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk/stats/permutation/permutation.py +44 -38
  36. risk_network-0.0.9/risk/stats/permutation/test_functions.py +69 -0
  37. risk_network-0.0.8b26/risk/stats/stats.py → risk_network-0.0.9/risk/stats/significance.py +17 -15
  38. risk_network-0.0.9/risk/stats/stat_tests.py +267 -0
  39. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk_network.egg-info/PKG-INFO +29 -44
  40. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk_network.egg-info/SOURCES.txt +18 -15
  41. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk_network.egg-info/requires.txt +2 -0
  42. {risk_network-0.0.8b26 → risk_network-0.0.9}/setup.py +7 -3
  43. risk_network-0.0.8b26/README.md +0 -102
  44. risk_network-0.0.8b26/risk/__init__.py +0 -10
  45. risk_network-0.0.8b26/risk/annotations/__init__.py +0 -7
  46. risk_network-0.0.8b26/risk/annotations/annotations.py +0 -312
  47. risk_network-0.0.8b26/risk/constants.py +0 -31
  48. risk_network-0.0.8b26/risk/log/__init__.py +0 -9
  49. risk_network-0.0.8b26/risk/neighborhoods/__init__.py +0 -10
  50. risk_network-0.0.8b26/risk/neighborhoods/community.py +0 -189
  51. risk_network-0.0.8b26/risk/network/__init__.py +0 -8
  52. risk_network-0.0.8b26/risk/network/geometry.py +0 -168
  53. risk_network-0.0.8b26/risk/network/plot/__init__.py +0 -6
  54. risk_network-0.0.8b26/risk/risk.py +0 -522
  55. risk_network-0.0.8b26/risk/stats/__init__.py +0 -9
  56. risk_network-0.0.8b26/risk/stats/hypergeom.py +0 -54
  57. risk_network-0.0.8b26/risk/stats/permutation/__init__.py +0 -6
  58. risk_network-0.0.8b26/risk/stats/permutation/test_functions.py +0 -61
  59. risk_network-0.0.8b26/risk/stats/poisson.py +0 -44
  60. {risk_network-0.0.8b26 → risk_network-0.0.9}/LICENSE +0 -0
  61. {risk_network-0.0.8b26 → risk_network-0.0.9}/MANIFEST.in +0 -0
  62. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk_network.egg-info/dependency_links.txt +0 -0
  63. {risk_network-0.0.8b26 → risk_network-0.0.9}/risk_network.egg-info/top_level.txt +0 -0
  64. {risk_network-0.0.8b26 → risk_network-0.0.9}/setup.cfg +0 -0
{risk_network-0.0.8b26 → risk_network-0.0.9}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: risk-network
- Version: 0.0.8b26
+ Version: 0.0.9
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -695,6 +695,7 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: ipywidgets
+ Requires-Dist: leidenalg
  Requires-Dist: markov_clustering
  Requires-Dist: matplotlib
  Requires-Dist: networkx
@@ -702,12 +703,15 @@ Requires-Dist: nltk==3.8.1
  Requires-Dist: numpy
  Requires-Dist: openpyxl
  Requires-Dist: pandas
+ Requires-Dist: python-igraph
  Requires-Dist: python-louvain
  Requires-Dist: scikit-learn
  Requires-Dist: scipy
  Requires-Dist: statsmodels
  Requires-Dist: threadpoolctl
  Requires-Dist: tqdm
+ Dynamic: author
+ Dynamic: requires-python

  # RISK Network

@@ -724,78 +728,59 @@ Requires-Dist: tqdm
  ![Downloads](https://img.shields.io/pypi/dm/risk-network)
  ![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20macos%20%7C%20windows-lightgrey)

- **RISK (RISK Infers Spatial Kinships)** is a next-generation tool designed to streamline the analysis of biological and non-biological networks. RISK enhances network analysis with its modular architecture, extensive file format support, and advanced clustering algorithms. It simplifies the creation of publication-quality figures, making it an important tool for researchers across disciplines.
+ **RISK** (Regional Inference of Significant Kinships) is a next-generation tool for biological network annotation and visualization. RISK integrates community detection-based clustering, rigorous statistical enrichment analysis, and a modular framework to uncover biologically meaningful relationships and generate high-resolution visualizations. RISK supports diverse data formats and is optimized for large-scale network analysis, making it a valuable resource for researchers in systems biology and beyond.

  ## Documentation and Tutorial

- - **Documentation**: Comprehensive documentation is available [here](Documentation link).
- - **Tutorial**: An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial).
- We highly recommend new users to consult the documentation and tutorial early on to fully leverage RISK's capabilities.
+ An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial). We highly recommend new users to consult the documentation and tutorial early on to fully utilize RISK's capabilities.

  ## Installation

- RISK is compatible with Python 3.8 and later versions and operates on all major operating systems. Install RISK via pip:
+ RISK is compatible with Python 3.8 or later and runs on all major operating systems. To install the latest version of RISK, run:

  ```bash
- pip install risk-network
+ pip install risk-network --upgrade
  ```

  ## Features

- - **Comprehensive Network Analysis**: Analyze biological networks such as protein–protein interaction (PPI) and gene regulatory networks, as well as non-biological networks.
- - **Advanced Clustering Algorithms**: Utilize algorithms like Louvain, Markov Clustering, Spinglass, and more to identify key functional modules.
- - **Flexible Visualization**: Generate clear, publication-quality figures with customizable node and edge attributes, including colors, shapes, sizes, and labels.
- - **Efficient Data Handling**: Optimized for large datasets, supporting multiple file formats such as JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
- - **Statistical Analysis**: Integrated statistical tests, including hypergeometric, permutation, and Poisson tests, to assess the significance of enriched regions.
+ - **Comprehensive Network Analysis**: Analyze biological networks (e.g., protein–protein interaction and genetic interaction networks) as well as non-biological networks.
+ - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
+ - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
+ - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
  - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.

  ## Example Usage

- We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network, revealing both established and novel functional relationships. The visualization below highlights key biological processes such as ribosomal assembly and mitochondrial organization.
+ We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network from Michaelis et al. (2023), filtering for proteins with six or more interactions to emphasize core functional relationships. RISK identified compact, statistically enriched clusters corresponding to biological processes such as ribosomal assembly and mitochondrial organization.

- ![RISK Main Figure](https://i.imgur.com/5OP3Hqe.jpeg)
+ [![Figure 1](https://i.imgur.com/lJHJrJr.jpeg)](https://i.imgur.com/lJHJrJr.jpeg)

- RISK successfully detected both known and novel functional clusters within the yeast interactome. Clusters related to Golgi transport and actin nucleation were clearly defined and closely located, showcasing RISK's ability to map well-characterized interactions. Additionally, RISK identified links between mRNA processing pathways and vesicle trafficking proteins, consistent with recent studies demonstrating the role of vesicles in mRNA localization and stability.
+ This figure highlights RISK’s capability to detect both established and novel functional modules within the yeast interactome.

  ## Citation

- If you use RISK in your research, please cite the following:
+ If you use RISK in your research, please cite:

- **Horecka**, *et al.*, "RISK: a next-generation tool for biological network annotation and visualization", **[Journal Name]**, 2024. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)
+ **Horecka et al.**, "RISK: a next-generation tool for biological network annotation and visualization", **Bioinformatics**, 2025. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)

  ## Software Architecture and Implementation

- RISK features a streamlined, modular architecture designed to meet diverse research needs. Each module focuses on a specific task—such as network input/output, statistical analysis, or visualization—ensuring ease of adaptation and extension. This design enhances flexibility and reduces development overhead for users integrating RISK into their workflows.
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. It includes dedicated modules for:

- ### Supported Data Formats
-
- - **Input/Output**: JSON, CSV, TSV, Excel, Cytoscape, GPickle.
- - **Visualization Outputs**: SVG, PNG, PDF.
-
- ### Clustering Algorithms
-
- - **Available Algorithms**:
-   - Greedy Modularity
-   - Label Propagation
-   - Louvain
-   - Markov Clustering
-   - Spinglass
-   - Walktrap
- - **Distance Metrics**: Supports both spherical and Euclidean distance metrics.
-
- ### Statistical Tests
-
- - **Hypergeometric Test**
- - **Permutation Test** (single- or multi-process modes)
- - **Poisson Test**
+ - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
+ - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
+ - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
+ - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.

  ## Performance and Efficiency

- In benchmarking tests using the yeast interactome network, RISK demonstrated substantial improvements over previous tools in both computational performance and memory efficiency. RISK processed the dataset approximately **3.25 times faster**, reducing CPU time by **69%**, and required **25% less peak memory usage**, underscoring its efficient utilization of computational resources.
+ Benchmarking results demonstrate that RISK efficiently scales to networks exceeding hundreds of thousands of edges, maintaining low execution times and optimal memory usage across statistical tests.

  ## Contributing

- We welcome contributions from the community. Please use the following resources:
+ We welcome contributions from the community:

  - [Issues Tracker](https://github.com/irahorecka/risk/issues)
  - [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
@@ -806,8 +791,8 @@ If you encounter issues or have suggestions for new features, please use the [Is

  ## License

- RISK is freely available as open-source software under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
+ RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).

  ---

- **Note**: For detailed documentation and to access the interactive tutorial, please visit the links provided in the [Documentation and Tutorial](#documentation-and-tutorial) section.
+ **Note**: For detailed documentation and to access the interactive tutorial, please visit the links above.
risk_network-0.0.9/README.md
@@ -0,0 +1,83 @@
+ # RISK Network
+
+ <p align="center">
+ <img src="https://i.imgur.com/8TleEJs.png" width="50%" />
+ </p>
+
+ <br>
+
+ ![Python](https://img.shields.io/badge/python-3.8%2B-yellow)
+ [![pypiv](https://img.shields.io/pypi/v/risk-network.svg)](https://pypi.python.org/pypi/risk-network)
+ ![License](https://img.shields.io/badge/license-GPLv3-purple)
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.xxxxxxx.svg)](https://doi.org/10.5281/zenodo.xxxxxxx)
+ ![Downloads](https://img.shields.io/pypi/dm/risk-network)
+ ![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20macos%20%7C%20windows-lightgrey)
+
+ **RISK** (Regional Inference of Significant Kinships) is a next-generation tool for biological network annotation and visualization. RISK integrates community detection-based clustering, rigorous statistical enrichment analysis, and a modular framework to uncover biologically meaningful relationships and generate high-resolution visualizations. RISK supports diverse data formats and is optimized for large-scale network analysis, making it a valuable resource for researchers in systems biology and beyond.
+
+ ## Documentation and Tutorial
+
+ An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial). We highly recommend new users to consult the documentation and tutorial early on to fully utilize RISK's capabilities.
+
+ ## Installation
+
+ RISK is compatible with Python 3.8 or later and runs on all major operating systems. To install the latest version of RISK, run:
+
+ ```bash
+ pip install risk-network --upgrade
+ ```
+
+ ## Features
+
+ - **Comprehensive Network Analysis**: Analyze biological networks (e.g., protein–protein interaction and genetic interaction networks) as well as non-biological networks.
+ - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
+ - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
+ - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+ - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
+
+ ## Example Usage
+
+ We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network from Michaelis et al. (2023), filtering for proteins with six or more interactions to emphasize core functional relationships. RISK identified compact, statistically enriched clusters corresponding to biological processes such as ribosomal assembly and mitochondrial organization.
+
+ [![Figure 1](https://i.imgur.com/lJHJrJr.jpeg)](https://i.imgur.com/lJHJrJr.jpeg)
+
+ This figure highlights RISK’s capability to detect both established and novel functional modules within the yeast interactome.
+
+ ## Citation
+
+ If you use RISK in your research, please cite:
+
+ **Horecka et al.**, "RISK: a next-generation tool for biological network annotation and visualization", **Bioinformatics**, 2025. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)
+
+ ## Software Architecture and Implementation
+
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. It includes dedicated modules for:
+
+ - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
+ - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
+ - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
+ - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+
+ ## Performance and Efficiency
+
+ Benchmarking results demonstrate that RISK efficiently scales to networks exceeding hundreds of thousands of edges, maintaining low execution times and optimal memory usage across statistical tests.
+
+ ## Contributing
+
+ We welcome contributions from the community:
+
+ - [Issues Tracker](https://github.com/irahorecka/risk/issues)
+ - [Source Code](https://github.com/irahorecka/risk/tree/main/risk)
+
+ ## Support
+
+ If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/irahorecka/risk/issues) on GitHub.
+
+ ## License
+
+ RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
+
+ ---
+
+ **Note**: For detailed documentation and to access the interactive tutorial, please visit the links above.
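As context for the overrepresentation tests named in the Features list above, the sketch below shows the standard form of a hypergeometric enrichment test using `scipy.stats`. This is not RISK's internal code (the package's implementation lives in `risk/stats/stat_tests.py`, not reproduced here); the counts are made-up illustrative values.

```python
# Standalone illustration of a hypergeometric overrepresentation test; the
# numbers are hypothetical and do not come from the RISK package.
from scipy.stats import hypergeom

N = 5000  # nodes in the network (population size)
K = 60    # nodes carrying a given annotation term
n = 40    # nodes in the neighborhood being tested
k = 8     # annotated nodes observed in that neighborhood

# P(X >= k): chance of seeing at least k annotated nodes in the neighborhood.
p_value = hypergeom.sf(k - 1, N, K, n)
print(f"{p_value:.3e}")
```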
{risk_network-0.0.8b26 → risk_network-0.0.9}/pyproject.toml
@@ -27,6 +27,7 @@ classifiers = [
  ]
  dependencies = [
  "ipywidgets",
+ "leidenalg",
  "markov_clustering",
  "matplotlib",
  "networkx",
@@ -34,6 +35,7 @@ dependencies = [
  "numpy",
  "openpyxl",
  "pandas",
+ "python-igraph",
  "python-louvain",
  "scikit-learn",
  "scipy",
risk_network-0.0.9/risk/__init__.py
@@ -0,0 +1,10 @@
+ """
+ risk
+ ~~~~
+
+ RISK: Regional Inference of Significant Kinships
+ """
+
+ from risk.risk import RISK
+
+ __version__ = "0.0.9"
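For orientation, a minimal sketch that exercises only the names this `__init__.py` exposes; the analysis methods of the `RISK` class are defined in modules not reproduced in this diff, so none are assumed or called here.

```python
# Uses only what risk/__init__.py above exports.
import risk
from risk import RISK

print(risk.__version__)  # -> "0.0.9"
print(RISK.__name__)     # -> "RISK", the main entry-point class
```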
risk_network-0.0.9/risk/annotations/__init__.py
@@ -0,0 +1,7 @@
+ """
+ risk/annotations
+ ~~~~~~~~~~~~~~~~
+ """
+
+ from risk.annotations.annotations import define_top_annotations, get_weighted_description
+ from risk.annotations.io import AnnotationsIO
risk_network-0.0.9/risk/annotations/annotations.py
@@ -0,0 +1,389 @@
+ """
+ risk/annotations/annotations
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ """
+
+ import os
+ import re
+ import zipfile
+ from collections import Counter
+ from itertools import compress
+ from typing import Any, Dict, List, Set
+
+ import networkx as nx
+ import nltk
+ import numpy as np
+ import pandas as pd
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tokenize import word_tokenize
+
+ from risk.log import logger
+ from scipy.sparse import coo_matrix
+
+
+ def ensure_nltk_resource(resource: str) -> None:
+     """Ensure the specified NLTK resource is available."""
+     # Define the path to the resource within the NLTK data directory
+     resource_path = f"corpora/{resource}"
+     # Check if the resource is already available.
+     try:
+         nltk.data.find(resource_path)
+         return
+     except LookupError:
+         print(f"Resource '{resource}' not found. Attempting to download...")
+
+     # Download the resource.
+     nltk.download(resource)
+     # Check again after downloading.
+     try:
+         nltk.data.find(resource_path)
+         return
+     except LookupError:
+         print(f"Resource '{resource}' still not found after download. Checking for a ZIP file...")
+
+     # Look for a ZIP file in all known NLTK data directories.
+     for data_path in nltk.data.path:
+         zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
+         if os.path.isfile(zip_path):
+             print(f"Found ZIP file for '{resource}' at: {zip_path}")
+             target_dir = os.path.join(data_path, "corpora")
+             with zipfile.ZipFile(zip_path, "r") as z:
+                 z.extractall(path=target_dir)
+             print(f"Unzipped '{resource}' successfully.")
+             break  # Stop after unzipping the first found ZIP.
+
+     # Final check: Try to load the resource one last time.
+     try:
+         nltk.data.find(resource_path)
+         print(f"Resource '{resource}' is now available.")
+     except LookupError:
+         raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
+
+
+ # Ensure the NLTK stopwords and WordNet resources are available
+ ensure_nltk_resource("stopwords")
+ ensure_nltk_resource("wordnet")
+ # Use NLTK's stopwords - load all languages
+ STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
+ # Initialize the WordNet lemmatizer, which is used for normalizing words
+ LEMMATIZER = WordNetLemmatizer()
+
+
+ def load_annotations(
+     network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
+ ) -> Dict[str, Any]:
+     """Convert annotations input to a sparse matrix and reindex based on the network's node labels.
+
+     Args:
+         network (nx.Graph): The network graph.
+         annotations_input (Dict[str, Any]): A dictionary with annotations.
+         min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
+             term to be included. Defaults to 2.
+
+     Returns:
+         Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the sparse binary annotations
+             matrix.
+
+     Raises:
+         ValueError: If no annotations are found for the nodes in the network.
+         ValueError: If no annotations have at least min_nodes_per_term nodes in the network.
+     """
+     # Step 1: Map nodes and annotations to indices
+     node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
+     node_to_idx = {node: i for i, node in enumerate(node_label_order)}
+     annotation_to_idx = {annotation: i for i, annotation in enumerate(annotations_input)}
+     # Step 2: Construct a sparse binary matrix directly
+     row = []
+     col = []
+     data = []
+     for annotation, nodes in annotations_input.items():
+         for node in nodes:
+             if node in node_to_idx and annotation in annotation_to_idx:
+                 row.append(node_to_idx[node])
+                 col.append(annotation_to_idx[annotation])
+                 data.append(1)
+
+     # Create a sparse binary matrix
+     num_nodes = len(node_to_idx)
+     num_annotations = len(annotation_to_idx)
+     annotations_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotations)).tocsr()
+     # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
+     valid_annotations = annotations_pivot.sum(axis=0).A1 >= min_nodes_per_term
+     annotations_pivot = annotations_pivot[:, valid_annotations]
+     # Step 4: Raise errors for empty matrices
+     if annotations_pivot.nnz == 0:
+         raise ValueError("No terms found in the annotation file for the nodes in the network.")
+
+     num_remaining_annotations = annotations_pivot.shape[1]
+     if num_remaining_annotations == 0:
+         raise ValueError(
+             f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
+         )
+
+     # Step 5: Extract ordered nodes and annotations
+     ordered_nodes = tuple(node_label_order)
+     ordered_annotations = tuple(
+         annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotations) if is_valid
+     )
+
+     # Log the filtering details
+     logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
+     logger.info(f"Number of input annotation terms: {num_annotations}")
+     logger.info(f"Number of remaining annotation terms: {num_remaining_annotations}")
+
+     return {
+         "ordered_nodes": ordered_nodes,
+         "ordered_annotations": ordered_annotations,
+         "matrix": annotations_pivot,
+     }
+
+
+ def define_top_annotations(
+     network: nx.Graph,
+     ordered_annotation_labels: List[str],
+     neighborhood_significance_sums: List[int],
+     significant_significance_matrix: np.ndarray,
+     significant_binary_significance_matrix: np.ndarray,
+     min_cluster_size: int = 5,
+     max_cluster_size: int = 1000,
+ ) -> pd.DataFrame:
+     """Define top annotations based on neighborhood significance sums and binary significance matrix.
+
+     Args:
+         network (NetworkX graph): The network graph.
+         ordered_annotation_labels (list of str): List of ordered annotation labels.
+         neighborhood_significance_sums (list of int): List of neighborhood significance sums.
+         significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+         significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
+         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
+         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
+
+     Returns:
+         pd.DataFrame: DataFrame with top annotations and their properties.
+     """
+     # Sum the columns of the significant significance matrix (positive floating point values)
+     significant_significance_scores = significant_significance_matrix.sum(axis=0)
+     # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
+     annotations_significance_matrix = pd.DataFrame(
+         {
+             "id": range(len(ordered_annotation_labels)),
+             "full_terms": ordered_annotation_labels,
+             "significant_neighborhood_significance_sums": neighborhood_significance_sums,
+             "significant_significance_score": significant_significance_scores,
+         }
+     )
+     annotations_significance_matrix["significant_annotations"] = False
+     # Apply size constraints to identify potential significant annotations
+     annotations_significance_matrix.loc[
+         (
+             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             >= min_cluster_size
+         )
+         & (
+             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             <= max_cluster_size
+         ),
+         "significant_annotations",
+     ] = True
+     # Initialize columns for connected components analysis
+     annotations_significance_matrix["num_connected_components"] = 0
+     annotations_significance_matrix["size_connected_components"] = None
+     annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
+         "size_connected_components"
+     ].astype(object)
+     annotations_significance_matrix["num_large_connected_components"] = 0
+
+     for attribute in annotations_significance_matrix.index.values[
+         annotations_significance_matrix["significant_annotations"]
+     ]:
+         # Identify significant neighborhoods based on the binary significance matrix
+         significant_neighborhoods = list(
+             compress(list(network), significant_binary_significance_matrix[:, attribute])
+         )
+         significant_network = nx.subgraph(network, significant_neighborhoods)
+         # Analyze connected components within the significant subnetwork
+         connected_components = sorted(
+             nx.connected_components(significant_network), key=len, reverse=True
+         )
+         size_connected_components = np.array([len(c) for c in connected_components])
+
+         # Filter the size of connected components by min_cluster_size and max_cluster_size
+         filtered_size_connected_components = size_connected_components[
+             (size_connected_components >= min_cluster_size)
+             & (size_connected_components <= max_cluster_size)
+         ]
+         # Calculate the number of connected components and large connected components
+         num_connected_components = len(connected_components)
+         num_large_connected_components = len(filtered_size_connected_components)
+
+         # Assign the number of connected components
+         annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
+             num_connected_components
+         )
+         # Filter out attributes with more than one connected component
+         annotations_significance_matrix.loc[
+             annotations_significance_matrix["num_connected_components"] > 1,
+             "significant_annotations",
+         ] = False
+         # Assign the number of large connected components
+         annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
+             num_large_connected_components
+         )
+         # Assign the size of connected components, ensuring it is always a list
+         annotations_significance_matrix.at[attribute, "size_connected_components"] = (
+             filtered_size_connected_components.tolist()
+         )
+
+     return annotations_significance_matrix
+
+
+ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
+     """Generate a weighted description from words and their corresponding scores,
+     using improved weighting logic with normalization, lemmatization, and aggregation.
+
+     Args:
+         words_column (pd.Series): A pandas Series containing strings (phrases) to process.
+         scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.
+
+     Returns:
+         str: A coherent description formed from the most frequent and significant words.
+     """
+     # Normalize significance scores to [0,1]. If all scores are identical, use 1.
+     if scores_column.max() == scores_column.min():
+         normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
+     else:
+         normalized_scores = (scores_column - scores_column.min()) / (
+             scores_column.max() - scores_column.min()
+         )
+
+     # Accumulate weighted counts for each token (after cleaning and lemmatization)
+     weighted_counts = {}
+     for phrase, score in zip(words_column, normalized_scores):
+         # Tokenize the phrase
+         tokens = word_tokenize(str(phrase))
+         # Determine the weight (scale factor; here multiplying normalized score by 10)
+         weight = max(1, int((0 if pd.isna(score) else score) * 10))
+         for token in tokens:
+             # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
+             token_clean = re.sub(r"[^\w\-]", "", token).strip()
+             if not token_clean:
+                 continue
+             # Skip tokens that are pure numbers
+             if token_clean.isdigit():
+                 continue
+             # Skip stopwords
+             if token_clean in STOP_WORDS:
+                 continue
+             # Lemmatize the token to merge similar forms
+             token_norm = LEMMATIZER.lemmatize(token_clean)
+             weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
+
+     # Reconstruct a weighted token list by repeating each token by its aggregated count.
+     weighted_words = []
+     for token, count in weighted_counts.items():
+         weighted_words.extend([token] * count)
+
+     # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
+     combined_tokens = []
+     for token in weighted_words:
+         if re.match(r"^\d+-\w+", token):
+             combined_tokens.append(token)
+         elif token.replace(".", "", 1).isdigit():
+             continue
+         else:
+             combined_tokens.append(token)
+
+     # If the only token is numeric, return a default value.
+     if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
+         return "N/A"
+
+     # Simplify the token list to remove near-duplicates based on the Jaccard index.
+     simplified_words = _simplify_word_list(combined_tokens)
+     # Generate a coherent description from the simplified words.
+     description = _generate_coherent_description(simplified_words)
+
+     return description
+
+
+ def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
+     """Filter out words that are too similar based on the Jaccard index,
+     keeping the word with the higher aggregated count.
+
+     Args:
+         words (List[str]): The list of tokens to be filtered.
+         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.
+
+     Returns:
+         List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
+     """
+     # Count the occurrences (which reflect the weighted importance)
+     word_counts = Counter(words)
+     filtered_words = []
+     used_words = set()
+
+     # Iterate through words sorted by descending weighted frequency
+     for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
+         if word in used_words:
+             continue
+
+         word_set = set(word)
+         # Find similar words (including the current word) based on the Jaccard index
+         similar_words = [
+             other_word
+             for other_word in word_counts
+             if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
+         ]
+         # Choose the word with the highest weighted count among the similar group
+         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
+         best_word = similar_words[0]
+         filtered_words.append(best_word)
+         used_words.update(similar_words)
+
+     # Preserve the original order (by frequency) from the filtered set
+     final_words = [word for word in words if word in filtered_words]
+
+     return final_words
+
+
+ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
+     """Calculate the Jaccard index between two sets.
+
+     Args:
+         set1 (Set[Any]): The first set.
+         set2 (Set[Any]): The second set.
+
+     Returns:
+         float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
+     """
+     intersection = len(set1.intersection(set2))
+     union = len(set1.union(set2))
+     return intersection / union if union else 0
+
+
+ def _generate_coherent_description(words: List[str]) -> str:
+     """Generate a coherent description from a list of words.
+
+     If there is only one unique entry, return it directly.
+     Otherwise, order the words by frequency and join them into a single string.
+
+     Args:
+         words (List[str]): A list of tokens.
+
+     Returns:
+         str: A coherent, space-separated description.
+     """
+     if not words:
+         return "N/A"
+
+     # If there is only one unique word, return it directly
+     unique_words = set(words)
+     if len(unique_words) == 1:
+         return list(unique_words)[0]
+
+     # Count weighted occurrences and sort in descending order.
+     word_counts = Counter(words)
+     most_common_words = [word for word, _ in word_counts.most_common()]
+     description = " ".join(most_common_words)
+
+     return description
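For readers skimming the new module, the following is a minimal usage sketch of the two public helpers added above, `load_annotations` and `get_weighted_description`. The function signatures come from the code in this diff; the toy graph, node labels, and annotation terms are invented for illustration and are not part of the package.

```python
# Illustrative sketch only: graph, labels, and terms below are made up.
import networkx as nx
import pandas as pd

from risk.annotations.annotations import get_weighted_description, load_annotations

# Nodes must carry the "label" attribute that load_annotations reindexes on.
network = nx.Graph()
network.add_node(0, label="YAL001C")
network.add_node(1, label="YBR002W")
network.add_node(2, label="YCL003C")
network.add_edges_from([(0, 1), (1, 2)])

# Map each annotation term to the node labels it covers.
annotations_input = {
    "ribosome biogenesis": ["YAL001C", "YBR002W"],
    "mitochondrial organization": ["YBR002W", "YCL003C"],
}

result = load_annotations(network, annotations_input, min_nodes_per_term=2)
print(result["ordered_annotations"])  # terms retained after the min-node filter
print(result["matrix"].shape)         # (n_nodes, n_retained_terms) sparse binary matrix

# Collapse a set of scored terms into a short weighted description.
# Note: word_tokenize may additionally require the NLTK "punkt" tokenizer data
# (nltk.download("punkt")) if it is not already installed locally.
terms = pd.Series(["ribosome biogenesis", "ribosomal subunit assembly"])
scores = pd.Series([0.9, 0.4])
print(get_weighted_description(terms, scores))
```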