risk-network 0.0.10.tar.gz → 0.0.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. risk_network-0.0.12/PKG-INFO +122 -0
  2. {risk_network-0.0.10 → risk_network-0.0.12}/README.md +7 -7
  3. {risk_network-0.0.10 → risk_network-0.0.12}/pyproject.toml +22 -6
  4. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/__init__.py +1 -1
  5. risk_network-0.0.12/src/risk/annotation/__init__.py +10 -0
  6. risk_network-0.0.10/risk/annotations/annotations.py → risk_network-0.0.12/src/risk/annotation/annotation.py +62 -102
  7. {risk_network-0.0.10/risk/annotations → risk_network-0.0.12/src/risk/annotation}/io.py +93 -92
  8. risk_network-0.0.12/src/risk/annotation/nltk_setup.py +86 -0
  9. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/__init__.py +1 -1
  10. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/parameters.py +26 -27
  11. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/__init__.py +0 -1
  12. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/api.py +38 -38
  13. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/community.py +33 -4
  14. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/domains.py +26 -28
  15. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/neighborhoods.py +8 -2
  16. risk_network-0.0.12/src/risk/neighborhoods/stats/__init__.py +13 -0
  17. risk_network-0.0.12/src/risk/neighborhoods/stats/permutation/__init__.py +6 -0
  18. {risk_network-0.0.10/risk → risk_network-0.0.12/src/risk/neighborhoods}/stats/permutation/permutation.py +24 -21
  19. {risk_network-0.0.10/risk → risk_network-0.0.12/src/risk/neighborhoods}/stats/permutation/test_functions.py +5 -4
  20. risk_network-0.0.10/risk/stats/stat_tests.py → risk_network-0.0.12/src/risk/neighborhoods/stats/tests.py +62 -54
  21. risk_network-0.0.12/src/risk/network/__init__.py +4 -0
  22. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/__init__.py +0 -2
  23. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/api.py +19 -19
  24. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/graph.py +73 -68
  25. risk_network-0.0.10/risk/stats/significance.py → risk_network-0.0.12/src/risk/network/graph/stats.py +2 -2
  26. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/summary.py +12 -13
  27. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/io.py +163 -20
  28. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/__init__.py +0 -2
  29. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/api.py +1 -1
  30. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/canvas.py +36 -36
  31. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/contour.py +14 -15
  32. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/labels.py +303 -294
  33. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/network.py +6 -6
  34. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/plotter.py +8 -10
  35. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/utils/colors.py +15 -8
  36. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/utils/layout.py +3 -3
  37. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/risk.py +6 -7
  38. risk_network-0.0.12/src/risk_network.egg-info/PKG-INFO +122 -0
  39. risk_network-0.0.12/src/risk_network.egg-info/SOURCES.txt +50 -0
  40. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/requires.txt +1 -1
  41. risk_network-0.0.12/tests/test_load_annotation.py +291 -0
  42. risk_network-0.0.12/tests/test_load_graph.py +420 -0
  43. risk_network-0.0.12/tests/test_load_io_combinations.py +95 -0
  44. risk_network-0.0.12/tests/test_load_neighborhoods.py +455 -0
  45. risk_network-0.0.12/tests/test_load_network.py +401 -0
  46. risk_network-0.0.12/tests/test_load_plotter.py +1483 -0
  47. risk_network-0.0.12/tests/test_log.py +72 -0
  48. risk_network-0.0.10/MANIFEST.in +0 -20
  49. risk_network-0.0.10/PKG-INFO +0 -798
  50. risk_network-0.0.10/risk/annotations/__init__.py +0 -7
  51. risk_network-0.0.10/risk/network/__init__.py +0 -6
  52. risk_network-0.0.10/risk/network/geometry.py +0 -150
  53. risk_network-0.0.10/risk/stats/__init__.py +0 -15
  54. risk_network-0.0.10/risk/stats/permutation/__init__.py +0 -6
  55. risk_network-0.0.10/risk_network.egg-info/PKG-INFO +0 -798
  56. risk_network-0.0.10/risk_network.egg-info/SOURCES.txt +0 -45
  57. risk_network-0.0.10/setup.py +0 -67
  58. {risk_network-0.0.10 → risk_network-0.0.12}/LICENSE +0 -0
  59. {risk_network-0.0.10 → risk_network-0.0.12}/setup.cfg +0 -0
  60. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/console.py +0 -0
  61. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/dependency_links.txt +0 -0
  62. {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/top_level.txt +0 -0
risk_network-0.0.12/PKG-INFO (new file)
@@ -0,0 +1,122 @@
+ Metadata-Version: 2.4
+ Name: risk-network
+ Version: 0.0.12
+ Summary: A Python package for biological network analysis
+ Author-email: Ira Horecka <ira89@icloud.com>
+ License: GPL-3.0-or-later
+ Project-URL: Homepage, https://github.com/riskportal/network
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Scientific/Engineering :: Visualization
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 4 - Beta
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: ipywidgets
+ Requires-Dist: leidenalg
+ Requires-Dist: markov_clustering
+ Requires-Dist: matplotlib
+ Requires-Dist: networkx
+ Requires-Dist: nltk
+ Requires-Dist: numpy
+ Requires-Dist: openpyxl
+ Requires-Dist: pandas
+ Requires-Dist: python-igraph
+ Requires-Dist: python-louvain
+ Requires-Dist: scikit-learn
+ Requires-Dist: scipy
+ Requires-Dist: statsmodels
+ Requires-Dist: threadpoolctl
+ Requires-Dist: tqdm
+ Dynamic: license-file
+
+ # RISK Network
+
+ <p align="center">
+     <img src="https://i.imgur.com/8TleEJs.png" width="50%" />
+ </p>
+
+ <br>
+
+ ![Python](https://img.shields.io/badge/python-3.8%2B-yellow)
+ [![pypiv](https://img.shields.io/pypi/v/risk-network.svg)](https://pypi.python.org/pypi/risk-network)
+ ![License](https://img.shields.io/badge/license-GPLv3-purple)
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.xxxxxxx.svg)](https://doi.org/10.5281/zenodo.xxxxxxx)
+ ![Downloads](https://img.shields.io/pypi/dm/risk-network)
+ ![Tests](https://github.com/riskportal/network/actions/workflows/ci.yml/badge.svg)
+
+ **RISK** (Regional Inference of Significant Kinships) is a next-generation tool for biological network annotation and visualization. RISK integrates community detection-based clustering, rigorous statistical enrichment analysis, and a modular framework to uncover biologically meaningful relationships and generate high-resolution visualizations. RISK supports diverse data formats and is optimized for large-scale network analysis, making it a valuable resource for researchers in systems biology and beyond.
+
+ ## Documentation and Tutorial
+
+ Full documentation is available at:
+
+ - **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+ - **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)
+
+ ## Installation
+
+ RISK is compatible with Python 3.8 or later and runs on all major operating systems. To install the latest version of RISK, run:
+
+ ```bash
+ pip install risk-network --upgrade
+ ```
+
+ ## Features
+
+ - **Comprehensive Network Analysis**: Analyze biological networks (e.g., protein–protein interaction and genetic interaction networks) as well as non-biological networks.
+ - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
+ - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
+ - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+ - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
+
+ ## Example Usage
+
+ We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network from Michaelis et al. (2023), filtering for proteins with six or more interactions to emphasize core functional relationships. RISK identified compact, statistically enriched clusters corresponding to biological processes such as ribosomal assembly and mitochondrial organization.
+
+ [![Figure 1](https://i.imgur.com/lJHJrJr.jpeg)](https://i.imgur.com/lJHJrJr.jpeg)
+
+ This figure highlights RISK’s capability to detect both established and novel functional modules within the yeast interactome.
+
+ ## Citation
+
+ If you use RISK in your research, please cite:
+
+ **Horecka et al.**, "RISK: a next-generation tool for biological network annotation and visualization", **Bioinformatics**, 2025. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)
+
+ ## Software Architecture and Implementation
+
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:
+
+ - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
+ - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
+ - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
+ - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+ - **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.
+
+ ## Performance and Efficiency
+
+ Benchmarking results demonstrate that RISK efficiently scales to networks exceeding hundreds of thousands of edges, maintaining low execution times and optimal memory usage across statistical tests.
+
+ ## Contributing
+
+ We welcome contributions from the community:
+
+ - [Issues Tracker](https://github.com/riskportal/network/issues)
+ - [Source Code](https://github.com/riskportal/network/tree/main/risk)
+
+ ## Support
+
+ If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/riskportal/network/issues) on GitHub.
+
+ ## License
+
+ RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
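
The **Statistical Analysis** bullet in the README above names six tests. For orientation, here is a minimal standalone sketch of the first of them, hypergeometric overrepresentation, using `scipy` (a declared dependency of this package). The numbers are invented for illustration, and this is not RISK's own implementation:

```python
# One-sided hypergeometric test: is an annotation term overrepresented in a
# network neighborhood? sf(k - 1, ...) is the survival function, P(X >= k).
from scipy.stats import hypergeom

M = 6000  # population size, e.g. all labeled nodes in the network
n = 120   # population nodes carrying the annotation term
N = 40    # nodes in one neighborhood
k = 9     # annotated nodes observed in that neighborhood

p_value = hypergeom.sf(k - 1, M, n, N)
print(f"P(X >= {k}) = {p_value:.3g}")  # very small here: the term is overrepresented
```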
{risk_network-0.0.10 → risk_network-0.0.12}/README.md
@@ -17,7 +17,10 @@
  
  ## Documentation and Tutorial
  
- An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial). We highly recommend new users to consult the documentation and tutorial early on to fully utilize RISK's capabilities.
+ Full documentation is available at:
+
+ - **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+ - **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)
  
  ## Installation
  
@@ -33,7 +36,7 @@ pip install risk-network --upgrade
  - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
  - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
  - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
- - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
  - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
  
  ## Example Usage
@@ -52,12 +55,13 @@ If you use RISK in your research, please cite:
  
  ## Software Architecture and Implementation
  
- RISK features a streamlined, modular architecture designed to meet diverse research needs. It includes dedicated modules for:
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:
  
  - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
  - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
  - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
  - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+ - **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.
  
  ## Performance and Efficiency
  
@@ -77,7 +81,3 @@ If you encounter issues or have suggestions for new features, please use the [Is
  ## License
  
  RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
-
- ---
-
- **Note**: For detailed documentation and to access the interactive tutorial, please visit the links above.
{risk_network-0.0.10 → risk_network-0.0.12}/pyproject.toml
@@ -1,20 +1,19 @@
  [build-system]
- requires = ["setuptools", "wheel", "numpy"]
+ requires = ["setuptools", "numpy"]
  build-backend = "setuptools.build_meta"
  
  [project]
  name = "risk-network"
- dynamic = ["version"]  # Indicates that version is determined dynamically
+ dynamic = ["version"]
  description = "A Python package for biological network analysis"
  authors = [
      { name = "Ira Horecka", email = "ira89@icloud.com" },
  ]
  readme = "README.md"
- license = { file = "LICENSE" }
+ requires-python = ">=3.8"
  classifiers = [
      "Intended Audience :: Developers",
      "Intended Audience :: Science/Research",
-     "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
      "Operating System :: OS Independent",
      "Programming Language :: Python :: 3",
      "Programming Language :: Python :: 3.8",
@@ -31,7 +30,7 @@ dependencies = [
      "markov_clustering",
      "matplotlib",
      "networkx",
-     "nltk==3.8.1",
+     "nltk",
      "numpy",
      "openpyxl",
      "pandas",
@@ -43,4 +42,21 @@ dependencies = [
      "threadpoolctl",
      "tqdm",
  ]
- requires-python = ">=3.8"
+
+ [project.license]
+ text = "GPL-3.0-or-later"
+
+ [project.urls]
+ "Homepage" = "https://github.com/riskportal/network"
+
+ [tool.setuptools]
+ package-dir = {"" = "src"}
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.setuptools.dynamic]
+ version = { attr = "risk.__version__" }
+
+ [tool.pytest.ini_options]
+ pythonpath = ["src"]
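
With `setup.py` deleted (file 57 in the list above), the build is driven entirely by this `pyproject.toml`: sources move under `src/`, and the version is resolved at build time from `risk.__version__` via `[tool.setuptools.dynamic]`. A minimal post-install sanity check of that wiring, assuming the 0.0.12 package is installed:

```python
# The dynamically resolved distribution version should match the module
# attribute it was read from (src/risk/__init__.py).
# importlib.metadata is stdlib on Python 3.8+, the declared floor.
from importlib.metadata import version

import risk

assert version("risk-network") == risk.__version__ == "0.0.12"
```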
{risk_network-0.0.10 → risk_network-0.0.12/src}/risk/__init__.py
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
  
  from risk.risk import RISK
  
- __version__ = "0.0.10"
+ __version__ = "0.0.12"
risk_network-0.0.12/src/risk/annotation/__init__.py (new file)
@@ -0,0 +1,10 @@
+ """
+ risk/annotation
+ ~~~~~~~~~~~~~~~
+ """
+
+ from risk.annotation.annotation import (
+     define_top_annotation,
+     get_weighted_description,
+ )
+ from risk.annotation.io import AnnotationIO
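
For downstream code, this new `__init__.py` (replacing the deleted `risk_network-0.0.10/risk/annotations/__init__.py`, file 50 above) defines the public import surface of the renamed package. A before/after sketch; the 0.0.10 side is left as a comment because the old package's exports are not visible in this diff:

```python
# 0.0.10 (plural package name; exact exports not shown in this diff):
# from risk.annotations import ...

# 0.0.12 (singular names, re-exported by src/risk/annotation/__init__.py):
from risk.annotation import AnnotationIO, define_top_annotation, get_weighted_description
```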
risk_network-0.0.10/risk/annotations/annotations.py → risk_network-0.0.12/src/risk/annotation/annotation.py
@@ -1,88 +1,48 @@
  """
- risk/annotations/annotations
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ risk/annotation/annotation
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
  """
  
- import os
  import re
- import zipfile
  from collections import Counter
  from itertools import compress
  from typing import Any, Dict, List, Set
  
  import networkx as nx
- import nltk
  import numpy as np
  import pandas as pd
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
  from nltk.tokenize import word_tokenize
+ from scipy.sparse import coo_matrix
  
+ from risk.annotation.nltk_setup import setup_nltk_resources
  from risk.log import logger
- from scipy.sparse import coo_matrix
  
  
- def ensure_nltk_resource(resource: str) -> None:
-     """Ensure the specified NLTK resource is available."""
-     # Define the path to the resource within the NLTK data directory
-     resource_path = f"corpora/{resource}"
-     # Check if the resource is already available.
-     try:
-         nltk.data.find(resource_path)
-         return
-     except LookupError:
-         print(f"Resource '{resource}' not found. Attempting to download...")
-
-     # Download the resource.
-     nltk.download(resource)
-     # Check again after downloading.
-     try:
-         nltk.data.find(resource_path)
-         return
-     except LookupError:
-         print(f"Resource '{resource}' still not found after download. Checking for a ZIP file...")
-
-     # Look for a ZIP file in all known NLTK data directories.
-     for data_path in nltk.data.path:
-         zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
-         if os.path.isfile(zip_path):
-             print(f"Found ZIP file for '{resource}' at: {zip_path}")
-             target_dir = os.path.join(data_path, "corpora")
-             with zipfile.ZipFile(zip_path, "r") as z:
-                 z.extractall(path=target_dir)
-             print(f"Unzipped '{resource}' successfully.")
-             break  # Stop after unzipping the first found ZIP.
-
-     # Final check: Try to check resource one last time. If it fails, rai
-     try:
-         nltk.data.find(resource_path)
-         print(f"Resource '{resource}' is now available.")
-     except LookupError:
-         raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
-
-
- # Ensure the NLTK stopwords and WordNet resources are available
- # punkt is known to have issues with the default download method, so we use a custom function if it fails
- try:
-     ensure_nltk_resource("punkt")
- except LookupError:
-     nltk.download("punkt")
- ensure_nltk_resource("stopwords")
- ensure_nltk_resource("wordnet")
- # Use NLTK's stopwords - load all languages
- STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
- # Initialize the WordNet lemmatizer, which is used for normalizing words
- LEMMATIZER = WordNetLemmatizer()
-
-
- def load_annotations(
-     network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
+ def initialize_nltk():
+     """Initialize all required NLTK components."""
+     setup_nltk_resources()
+
+     # After resources are available, initialize the components
+     from nltk.corpus import stopwords
+     from nltk.stem import WordNetLemmatizer
+
+     global STOP_WORDS, LEMMATIZER
+     STOP_WORDS = set(stopwords.words("english"))
+     LEMMATIZER = WordNetLemmatizer()
+
+
+ # Initialize NLTK components
+ initialize_nltk()
+
+
+ def load_annotation(
+     network: nx.Graph, annotation_input: Dict[str, Any], min_nodes_per_term: int = 2
  ) -> Dict[str, Any]:
-     """Convert annotations input to a sparse matrix and reindex based on the network's node labels.
+     """Convert annotation input to a sparse matrix and reindex based on the network's node labels.
  
      Args:
          network (nx.Graph): The network graph.
-         annotations_input (Dict[str, Any]): A dictionary with annotations.
+         annotation_input (Dict[str, Any]): An annotation dictionary.
          min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
              term to be included. Defaults to 2.
  
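This hunk drops the sixty-line `ensure_nltk_resource` download-and-unzip fallback in favor of `setup_nltk_resources` from the new `nltk_setup` module (file 8 in the list), and narrows `STOP_WORDS` from every stopword language to English only. A standalone sketch of the two globals `initialize_nltk()` prepares, runnable outside RISK; the resource names come from the removed code, and the sample sentence is invented:

```python
# English stopwords + WordNet lemmatizer applied to one toy sentence.
# Assumes the NLTK corpora can be downloaded or are already installed.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource, quiet=True)  # no-op if already present

STOP_WORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()

tokens = word_tokenize("Ribosomal subunits are assembled in the nucleolus")
print([LEMMATIZER.lemmatize(t) for t in tokens if t.lower() not in STOP_WORDS and t.isalpha()])
# -> ['Ribosomal', 'subunit', 'assembled', 'nucleolus']
```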
@@ -91,18 +51,18 @@ def load_annotations(
          matrix.
  
      Raises:
-         ValueError: If no annotations are found for the nodes in the network.
-         ValueError: If no annotations have at least min_nodes_per_term nodes in the network.
+         ValueError: If no annotation is found for the nodes in the network.
+         ValueError: If no annotation has at least min_nodes_per_term nodes in the network.
      """
      # Step 1: Map nodes and annotations to indices
      node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
      node_to_idx = {node: i for i, node in enumerate(node_label_order)}
-     annotation_to_idx = {annotation: i for i, annotation in enumerate(annotations_input)}
+     annotation_to_idx = {annotation: i for i, annotation in enumerate(annotation_input)}
      # Step 2: Construct a sparse binary matrix directly
      row = []
      col = []
      data = []
-     for annotation, nodes in annotations_input.items():
+     for annotation, nodes in annotation_input.items():
          for node in nodes:
              if node in node_to_idx and annotation in annotation_to_idx:
                  row.append(node_to_idx[node])
@@ -111,40 +71,40 @@ def load_annotations(
  
      # Create a sparse binary matrix
      num_nodes = len(node_to_idx)
-     num_annotations = len(annotation_to_idx)
-     annotations_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotations)).tocsr()
+     num_annotation = len(annotation_to_idx)
+     annotation_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotation)).tocsr()
      # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
-     valid_annotations = annotations_pivot.sum(axis=0).A1 >= min_nodes_per_term
-     annotations_pivot = annotations_pivot[:, valid_annotations]
+     valid_annotation = annotation_pivot.sum(axis=0).A1 >= min_nodes_per_term
+     annotation_pivot = annotation_pivot[:, valid_annotation]
      # Step 4: Raise errors for empty matrices
-     if annotations_pivot.nnz == 0:
+     if annotation_pivot.nnz == 0:
          raise ValueError("No terms found in the annotation file for the nodes in the network.")
  
-     num_remaining_annotations = annotations_pivot.shape[1]
-     if num_remaining_annotations == 0:
+     num_remaining_annotation = annotation_pivot.shape[1]
+     if num_remaining_annotation == 0:
          raise ValueError(
              f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
          )
  
      # Step 5: Extract ordered nodes and annotations
      ordered_nodes = tuple(node_label_order)
-     ordered_annotations = tuple(
-         annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotations) if is_valid
+     ordered_annotation = tuple(
+         annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotation) if is_valid
      )
  
      # Log the filtering details
      logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
-     logger.info(f"Number of input annotation terms: {num_annotations}")
-     logger.info(f"Number of remaining annotation terms: {num_remaining_annotations}")
+     logger.info(f"Number of input annotation terms: {num_annotation}")
+     logger.info(f"Number of remaining annotation terms: {num_remaining_annotation}")
  
      return {
          "ordered_nodes": ordered_nodes,
-         "ordered_annotations": ordered_annotations,
-         "matrix": annotations_pivot,
+         "ordered_annotation": ordered_annotation,
+         "matrix": annotation_pivot,
      }
  
  
- def define_top_annotations(
+ def define_top_annotation(
      network: nx.Graph,
      ordered_annotation_labels: List[str],
      neighborhood_significance_sums: List[int],
@@ -170,7 +130,7 @@ def define_top_annotations(
      # Sum the columns of the significant significance matrix (positive floating point values)
      significant_significance_scores = significant_significance_matrix.sum(axis=0)
      # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
-     annotations_significance_matrix = pd.DataFrame(
+     annotation_significance_matrix = pd.DataFrame(
          {
              "id": range(len(ordered_annotation_labels)),
              "full_terms": ordered_annotation_labels,
@@ -178,29 +138,29 @@
              "significant_significance_score": significant_significance_scores,
          }
      )
-     annotations_significance_matrix["significant_annotations"] = False
+     annotation_significance_matrix["significant_annotation"] = False
      # Apply size constraints to identify potential significant annotations
-     annotations_significance_matrix.loc[
+     annotation_significance_matrix.loc[
          (
-             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             annotation_significance_matrix["significant_neighborhood_significance_sums"]
              >= min_cluster_size
          )
          & (
-             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             annotation_significance_matrix["significant_neighborhood_significance_sums"]
              <= max_cluster_size
          ),
-         "significant_annotations",
+         "significant_annotation",
      ] = True
      # Initialize columns for connected components analysis
-     annotations_significance_matrix["num_connected_components"] = 0
-     annotations_significance_matrix["size_connected_components"] = None
-     annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
+     annotation_significance_matrix["num_connected_components"] = 0
+     annotation_significance_matrix["size_connected_components"] = None
+     annotation_significance_matrix["size_connected_components"] = annotation_significance_matrix[
          "size_connected_components"
      ].astype(object)
-     annotations_significance_matrix["num_large_connected_components"] = 0
+     annotation_significance_matrix["num_large_connected_components"] = 0
  
-     for attribute in annotations_significance_matrix.index.values[
-         annotations_significance_matrix["significant_annotations"]
+     for attribute in annotation_significance_matrix.index.values[
+         annotation_significance_matrix["significant_annotation"]
      ]:
          # Identify significant neighborhoods based on the binary significance matrix
          significant_neighborhoods = list(
@@ -223,24 +183,24 @@
          num_large_connected_components = len(filtered_size_connected_components)
  
          # Assign the number of connected components
-         annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
+         annotation_significance_matrix.loc[attribute, "num_connected_components"] = (
              num_connected_components
          )
          # Filter out attributes with more than one connected component
-         annotations_significance_matrix.loc[
-             annotations_significance_matrix["num_connected_components"] > 1,
-             "significant_annotations",
+         annotation_significance_matrix.loc[
+             annotation_significance_matrix["num_connected_components"] > 1,
+             "significant_annotation",
          ] = False
          # Assign the number of large connected components
-         annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
+         annotation_significance_matrix.loc[attribute, "num_large_connected_components"] = (
              num_large_connected_components
          )
          # Assign the size of connected components, ensuring it is always a list
-         annotations_significance_matrix.at[attribute, "size_connected_components"] = (
+         annotation_significance_matrix.at[attribute, "size_connected_components"] = (
              filtered_size_connected_components.tolist()
          )
  
-     return annotations_significance_matrix
+     return annotation_significance_matrix
  
  
  def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
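
Taken together, these renames change both the entry point (`load_annotations` → `load_annotation`) and the returned keys (`ordered_annotations` → `ordered_annotation`). A minimal usage sketch against the signature shown above, with a toy two-node graph; the labels and term are invented, and note that importing the module runs `initialize_nltk()`, so NLTK resources must be available:

```python
import networkx as nx

from risk.annotation.annotation import load_annotation

# load_annotation indexes nodes by their "label" attribute (Step 1 above)
G = nx.Graph()
G.add_node(0, label="YAL001C")
G.add_node(1, label="YAL002W")
G.add_edge(0, 1)

annotation_input = {"ribosome assembly": ["YAL001C", "YAL002W"]}
result = load_annotation(G, annotation_input, min_nodes_per_term=2)

print(result["ordered_nodes"])       # ('YAL001C', 'YAL002W')
print(result["ordered_annotation"])  # ('ribosome assembly',)
print(result["matrix"].toarray())    # [[1], [1]]; the sparse CSR matrix, densified
```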