risk-network 0.0.10__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk_network-0.0.12/PKG-INFO +122 -0
- {risk_network-0.0.10 → risk_network-0.0.12}/README.md +7 -7
- {risk_network-0.0.10 → risk_network-0.0.12}/pyproject.toml +22 -6
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/__init__.py +1 -1
- risk_network-0.0.12/src/risk/annotation/__init__.py +10 -0
- risk_network-0.0.10/risk/annotations/annotations.py → risk_network-0.0.12/src/risk/annotation/annotation.py +62 -102
- {risk_network-0.0.10/risk/annotations → risk_network-0.0.12/src/risk/annotation}/io.py +93 -92
- risk_network-0.0.12/src/risk/annotation/nltk_setup.py +86 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/__init__.py +1 -1
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/parameters.py +26 -27
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/__init__.py +0 -1
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/api.py +38 -38
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/community.py +33 -4
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/domains.py +26 -28
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/neighborhoods/neighborhoods.py +8 -2
- risk_network-0.0.12/src/risk/neighborhoods/stats/__init__.py +13 -0
- risk_network-0.0.12/src/risk/neighborhoods/stats/permutation/__init__.py +6 -0
- {risk_network-0.0.10/risk → risk_network-0.0.12/src/risk/neighborhoods}/stats/permutation/permutation.py +24 -21
- {risk_network-0.0.10/risk → risk_network-0.0.12/src/risk/neighborhoods}/stats/permutation/test_functions.py +5 -4
- risk_network-0.0.10/risk/stats/stat_tests.py → risk_network-0.0.12/src/risk/neighborhoods/stats/tests.py +62 -54
- risk_network-0.0.12/src/risk/network/__init__.py +4 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/__init__.py +0 -2
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/api.py +19 -19
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/graph.py +73 -68
- risk_network-0.0.10/risk/stats/significance.py → risk_network-0.0.12/src/risk/network/graph/stats.py +2 -2
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/graph/summary.py +12 -13
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/io.py +163 -20
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/__init__.py +0 -2
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/api.py +1 -1
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/canvas.py +36 -36
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/contour.py +14 -15
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/labels.py +303 -294
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/network.py +6 -6
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/plotter.py +8 -10
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/utils/colors.py +15 -8
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/network/plotter/utils/layout.py +3 -3
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/risk.py +6 -7
- risk_network-0.0.12/src/risk_network.egg-info/PKG-INFO +122 -0
- risk_network-0.0.12/src/risk_network.egg-info/SOURCES.txt +50 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/requires.txt +1 -1
- risk_network-0.0.12/tests/test_load_annotation.py +291 -0
- risk_network-0.0.12/tests/test_load_graph.py +420 -0
- risk_network-0.0.12/tests/test_load_io_combinations.py +95 -0
- risk_network-0.0.12/tests/test_load_neighborhoods.py +455 -0
- risk_network-0.0.12/tests/test_load_network.py +401 -0
- risk_network-0.0.12/tests/test_load_plotter.py +1483 -0
- risk_network-0.0.12/tests/test_log.py +72 -0
- risk_network-0.0.10/MANIFEST.in +0 -20
- risk_network-0.0.10/PKG-INFO +0 -798
- risk_network-0.0.10/risk/annotations/__init__.py +0 -7
- risk_network-0.0.10/risk/network/__init__.py +0 -6
- risk_network-0.0.10/risk/network/geometry.py +0 -150
- risk_network-0.0.10/risk/stats/__init__.py +0 -15
- risk_network-0.0.10/risk/stats/permutation/__init__.py +0 -6
- risk_network-0.0.10/risk_network.egg-info/PKG-INFO +0 -798
- risk_network-0.0.10/risk_network.egg-info/SOURCES.txt +0 -45
- risk_network-0.0.10/setup.py +0 -67
- {risk_network-0.0.10 → risk_network-0.0.12}/LICENSE +0 -0
- {risk_network-0.0.10 → risk_network-0.0.12}/setup.cfg +0 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk/log/console.py +0 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/dependency_links.txt +0 -0
- {risk_network-0.0.10 → risk_network-0.0.12/src}/risk_network.egg-info/top_level.txt +0 -0
--- /dev/null
+++ risk_network-0.0.12/PKG-INFO
@@ -0,0 +1,122 @@
+Metadata-Version: 2.4
+Name: risk-network
+Version: 0.0.12
+Summary: A Python package for biological network analysis
+Author-email: Ira Horecka <ira89@icloud.com>
+License: GPL-3.0-or-later
+Project-URL: Homepage, https://github.com/riskportal/network
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Scientific/Engineering :: Visualization
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Development Status :: 4 - Beta
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: ipywidgets
+Requires-Dist: leidenalg
+Requires-Dist: markov_clustering
+Requires-Dist: matplotlib
+Requires-Dist: networkx
+Requires-Dist: nltk
+Requires-Dist: numpy
+Requires-Dist: openpyxl
+Requires-Dist: pandas
+Requires-Dist: python-igraph
+Requires-Dist: python-louvain
+Requires-Dist: scikit-learn
+Requires-Dist: scipy
+Requires-Dist: statsmodels
+Requires-Dist: threadpoolctl
+Requires-Dist: tqdm
+Dynamic: license-file
+
+# RISK Network
+
+<p align="center">
+  <img src="https://i.imgur.com/8TleEJs.png" width="50%" />
+</p>
+
+<br>
+
+
+[](https://pypi.python.org/pypi/risk-network)
+
+[](https://doi.org/10.5281/zenodo.xxxxxxx)
+
+
+
+**RISK** (Regional Inference of Significant Kinships) is a next-generation tool for biological network annotation and visualization. RISK integrates community detection-based clustering, rigorous statistical enrichment analysis, and a modular framework to uncover biologically meaningful relationships and generate high-resolution visualizations. RISK supports diverse data formats and is optimized for large-scale network analysis, making it a valuable resource for researchers in systems biology and beyond.
+
+## Documentation and Tutorial
+
+Full documentation is available at:
+
+- **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+- **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)
+
+## Installation
+
+RISK is compatible with Python 3.8 or later and runs on all major operating systems. To install the latest version of RISK, run:
+
+```bash
+pip install risk-network --upgrade
+```
+
+## Features
+
+- **Comprehensive Network Analysis**: Analyze biological networks (e.g., protein–protein interaction and genetic interaction networks) as well as non-biological networks.
+- **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
+- **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
+- **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
+- **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+- **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
+
+## Example Usage
+
+We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network from Michaelis et al. (2023), filtering for proteins with six or more interactions to emphasize core functional relationships. RISK identified compact, statistically enriched clusters corresponding to biological processes such as ribosomal assembly and mitochondrial organization.
+
+[](https://i.imgur.com/lJHJrJr.jpeg)
+
+This figure highlights RISK’s capability to detect both established and novel functional modules within the yeast interactome.
+
+## Citation
+
+If you use RISK in your research, please cite:
+
+**Horecka et al.**, "RISK: a next-generation tool for biological network annotation and visualization", **Bioinformatics**, 2025. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)
+
+## Software Architecture and Implementation
+
+RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:
+
+- **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
+- **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
+- **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
+- **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+- **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.
+
+## Performance and Efficiency
+
+Benchmarking results demonstrate that RISK efficiently scales to networks exceeding hundreds of thousands of edges, maintaining low execution times and optimal memory usage across statistical tests.
+
+## Contributing
+
+We welcome contributions from the community:
+
+- [Issues Tracker](https://github.com/riskportal/network/issues)
+- [Source Code](https://github.com/riskportal/network/tree/main/risk)
+
+## Support
+
+If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/riskportal/network/issues) on GitHub.
+
+## License
+
+RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
--- risk_network-0.0.10/README.md
+++ risk_network-0.0.12/README.md
@@ -17,7 +17,10 @@

 ## Documentation and Tutorial

-
+Full documentation is available at:
+
+- **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+- **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)

 ## Installation

@@ -33,7 +36,7 @@ pip install risk-network --upgrade
 - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
 - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
 - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
-- **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+- **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
 - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.

 ## Example Usage

@@ -52,12 +55,13 @@ If you use RISK in your research, please cite:

 ## Software Architecture and Implementation

-RISK features a streamlined, modular architecture designed to meet diverse research needs. It includes dedicated modules for:
+RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:

 - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
 - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
 - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
 - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+- **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.

 ## Performance and Efficiency

@@ -77,7 +81,3 @@ If you encounter issues or have suggestions for new features, please use the [Is
 ## License

 RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
-
----
-
-**Note**: For detailed documentation and to access the interactive tutorial, please visit the links above.
--- risk_network-0.0.10/pyproject.toml
+++ risk_network-0.0.12/pyproject.toml
@@ -1,20 +1,19 @@
 [build-system]
-requires = ["setuptools", "
+requires = ["setuptools", "numpy"]
 build-backend = "setuptools.build_meta"

 [project]
 name = "risk-network"
-dynamic = ["version"]
+dynamic = ["version"]
 description = "A Python package for biological network analysis"
 authors = [
     { name = "Ira Horecka", email = "ira89@icloud.com" },
 ]
 readme = "README.md"
-
+requires-python = ">=3.8"
 classifiers = [
     "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.8",
@@ -31,7 +30,7 @@ dependencies = [
     "markov_clustering",
     "matplotlib",
     "networkx",
-    "nltk
+    "nltk",
     "numpy",
     "openpyxl",
     "pandas",
@@ -43,4 +42,21 @@ dependencies = [
     "threadpoolctl",
     "tqdm",
 ]
-
+
+[project.license]
+text = "GPL-3.0-or-later"
+
+[project.urls]
+"Homepage" = "https://github.com/riskportal/network"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.dynamic]
+version = { attr = "risk.__version__" }
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
--- risk_network-0.0.10/risk/annotations/annotations.py
+++ risk_network-0.0.12/src/risk/annotation/annotation.py
@@ -1,88 +1,48 @@
 """
-risk/
-
+risk/annotation/annotation
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 """

-import os
 import re
-import zipfile
 from collections import Counter
 from itertools import compress
 from typing import Any, Dict, List, Set

 import networkx as nx
-import nltk
 import numpy as np
 import pandas as pd
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
+from scipy.sparse import coo_matrix

+from risk.annotation.nltk_setup import setup_nltk_resources
 from risk.log import logger
-from scipy.sparse import coo_matrix


-def
-    """
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Look for a ZIP file in all known NLTK data directories.
-    for data_path in nltk.data.path:
-        zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
-        if os.path.isfile(zip_path):
-            print(f"Found ZIP file for '{resource}' at: {zip_path}")
-            target_dir = os.path.join(data_path, "corpora")
-            with zipfile.ZipFile(zip_path, "r") as z:
-                z.extractall(path=target_dir)
-            print(f"Unzipped '{resource}' successfully.")
-            break  # Stop after unzipping the first found ZIP.
-
-    # Final check: Try to check resource one last time. If it fails, rai
-    try:
-        nltk.data.find(resource_path)
-        print(f"Resource '{resource}' is now available.")
-    except LookupError:
-        raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
-
-
-# Ensure the NLTK stopwords and WordNet resources are available
-# punkt is known to have issues with the default download method, so we use a custom function if it fails
-try:
-    ensure_nltk_resource("punkt")
-except LookupError:
-    nltk.download("punkt")
-ensure_nltk_resource("stopwords")
-ensure_nltk_resource("wordnet")
-# Use NLTK's stopwords - load all languages
-STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
-# Initialize the WordNet lemmatizer, which is used for normalizing words
-LEMMATIZER = WordNetLemmatizer()
-
-
-def load_annotations(
-    network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
+def initialize_nltk():
+    """Initialize all required NLTK components."""
+    setup_nltk_resources()
+
+    # After resources are available, initialize the components
+    from nltk.corpus import stopwords
+    from nltk.stem import WordNetLemmatizer
+
+    global STOP_WORDS, LEMMATIZER
+    STOP_WORDS = set(stopwords.words("english"))
+    LEMMATIZER = WordNetLemmatizer()
+
+
+# Initialize NLTK components
+initialize_nltk()
+
+
+def load_annotation(
+    network: nx.Graph, annotation_input: Dict[str, Any], min_nodes_per_term: int = 2
 ) -> Dict[str, Any]:
-    """Convert
+    """Convert annotation input to a sparse matrix and reindex based on the network's node labels.

     Args:
         network (nx.Graph): The network graph.
-
+        annotation_input (Dict[str, Any]): An annotation dictionary.
         min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
             term to be included. Defaults to 2.

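The new import above pulls `setup_nltk_resources` from `risk/annotation/nltk_setup.py`, an 86-line file added in 0.0.12 whose body does not appear in this diff. A minimal sketch of what such a helper might look like, assuming it keeps the find-then-download fallback of the removed 0.0.10 code; the resource list and structure are guesses:

```python
# Hypothetical sketch of setup_nltk_resources; the real nltk_setup.py is not
# shown in this diff. Assumes the removed 0.0.10 pattern: probe for each
# resource and download it on a LookupError.
import nltk


def setup_nltk_resources() -> None:
    """Ensure the tokenizer, stopwords, and WordNet corpora are available."""
    for download_name, data_path in [
        ("punkt", "tokenizers/punkt"),
        ("stopwords", "corpora/stopwords"),
        ("wordnet", "corpora/wordnet"),
    ]:
        try:
            nltk.data.find(data_path)
        except LookupError:
            nltk.download(download_name)
```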
@@ -91,18 +51,18 @@ def load_annotations(
             matrix.

     Raises:
-        ValueError: If no
-        ValueError: If no
+        ValueError: If no annotation is found for the nodes in the network.
+        ValueError: If no annotation has at least min_nodes_per_term nodes in the network.
     """
     # Step 1: Map nodes and annotations to indices
     node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
     node_to_idx = {node: i for i, node in enumerate(node_label_order)}
-    annotation_to_idx = {annotation: i for i, annotation in enumerate(
+    annotation_to_idx = {annotation: i for i, annotation in enumerate(annotation_input)}
     # Step 2: Construct a sparse binary matrix directly
     row = []
     col = []
     data = []
-    for annotation, nodes in
+    for annotation, nodes in annotation_input.items():
         for node in nodes:
             if node in node_to_idx and annotation in annotation_to_idx:
                 row.append(node_to_idx[node])
@@ -111,40 +71,40 @@

     # Create a sparse binary matrix
     num_nodes = len(node_to_idx)
-
-
+    num_annotation = len(annotation_to_idx)
+    annotation_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotation)).tocsr()
     # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
-
-
+    valid_annotation = annotation_pivot.sum(axis=0).A1 >= min_nodes_per_term
+    annotation_pivot = annotation_pivot[:, valid_annotation]
     # Step 4: Raise errors for empty matrices
-    if
+    if annotation_pivot.nnz == 0:
         raise ValueError("No terms found in the annotation file for the nodes in the network.")

-
-    if
+    num_remaining_annotation = annotation_pivot.shape[1]
+    if num_remaining_annotation == 0:
         raise ValueError(
             f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
         )

     # Step 5: Extract ordered nodes and annotations
     ordered_nodes = tuple(node_label_order)
-
-        annotation for annotation, is_valid in zip(annotation_to_idx,
+    ordered_annotation = tuple(
+        annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotation) if is_valid
     )

     # Log the filtering details
     logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
-    logger.info(f"Number of input annotation terms: {
-    logger.info(f"Number of remaining annotation terms: {
+    logger.info(f"Number of input annotation terms: {num_annotation}")
+    logger.info(f"Number of remaining annotation terms: {num_remaining_annotation}")

     return {
         "ordered_nodes": ordered_nodes,
-        "
-        "matrix":
+        "ordered_annotation": ordered_annotation,
+        "matrix": annotation_pivot,
     }


-def
+def define_top_annotation(
     network: nx.Graph,
     ordered_annotation_labels: List[str],
     neighborhood_significance_sums: List[int],
@@ -170,7 +130,7 @@ def define_top_annotations(
     # Sum the columns of the significant significance matrix (positive floating point values)
     significant_significance_scores = significant_significance_matrix.sum(axis=0)
     # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
-
+    annotation_significance_matrix = pd.DataFrame(
         {
             "id": range(len(ordered_annotation_labels)),
             "full_terms": ordered_annotation_labels,
@@ -178,29 +138,29 @@ def define_top_annotations(
             "significant_significance_score": significant_significance_scores,
         }
     )
-
+    annotation_significance_matrix["significant_annotation"] = False
     # Apply size constraints to identify potential significant annotations
-
+    annotation_significance_matrix.loc[
         (
-
+            annotation_significance_matrix["significant_neighborhood_significance_sums"]
             >= min_cluster_size
         )
         & (
-
+            annotation_significance_matrix["significant_neighborhood_significance_sums"]
             <= max_cluster_size
         ),
-        "
+        "significant_annotation",
     ] = True
     # Initialize columns for connected components analysis
-
-
-
+    annotation_significance_matrix["num_connected_components"] = 0
+    annotation_significance_matrix["size_connected_components"] = None
+    annotation_significance_matrix["size_connected_components"] = annotation_significance_matrix[
         "size_connected_components"
     ].astype(object)
-
+    annotation_significance_matrix["num_large_connected_components"] = 0

-    for attribute in
-
+    for attribute in annotation_significance_matrix.index.values[
+        annotation_significance_matrix["significant_annotation"]
     ]:
         # Identify significant neighborhoods based on the binary significance matrix
         significant_neighborhoods = list(
@@ -223,24 +183,24 @@ def define_top_annotations(
         num_large_connected_components = len(filtered_size_connected_components)

         # Assign the number of connected components
-
+        annotation_significance_matrix.loc[attribute, "num_connected_components"] = (
             num_connected_components
         )
         # Filter out attributes with more than one connected component
-
-
-        "
+        annotation_significance_matrix.loc[
+            annotation_significance_matrix["num_connected_components"] > 1,
+            "significant_annotation",
         ] = False
         # Assign the number of large connected components
-
+        annotation_significance_matrix.loc[attribute, "num_large_connected_components"] = (
             num_large_connected_components
         )
         # Assign the size of connected components, ensuring it is always a list
-
+        annotation_significance_matrix.at[attribute, "size_connected_components"] = (
             filtered_size_connected_components.tolist()
         )

-    return
+    return annotation_significance_matrix


 def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
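As rewritten, `load_annotation` returns a dictionary with the ordered node labels, the surviving annotation terms, and a node-by-term CSR matrix. A usage sketch based on the signature and filtering behavior shown in this diff; note that importing the module runs `initialize_nltk()`, which may download NLTK resources on first use:

```python
# Usage sketch of the new loader, based on the signature and behavior shown
# in this diff. Importing the module runs initialize_nltk(), which may
# download NLTK resources on first use.
import networkx as nx

from risk.annotation.annotation import load_annotation

# load_annotation indexes nodes by their "label" attribute.
network = nx.Graph()
for name in ("A", "B", "C"):
    network.add_node(name, label=name)
network.add_edges_from([("A", "B"), ("B", "C")])

# Terms backed by fewer than min_nodes_per_term network nodes are dropped.
annotation_input = {"term1": ["A", "B"], "term2": ["C"]}
result = load_annotation(network, annotation_input, min_nodes_per_term=2)

print(result["ordered_annotation"])  # ("term1",): term2 was filtered out
print(result["matrix"].toarray())    # [[1] [1] [0]]: rows A and B carry term1
```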
|