risk-network 0.0.12b0__tar.gz → 0.0.12b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. risk_network-0.0.12b1/PKG-INFO +122 -0
  2. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/pyproject.toml +12 -6
  3. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk/__init__.py +1 -1
  4. risk_network-0.0.12b1/src/risk/annotations/__init__.py +10 -0
  5. risk_network-0.0.12b1/src/risk/annotations/annotations.py +354 -0
  6. risk_network-0.0.12b1/src/risk/annotations/io.py +241 -0
  7. risk_network-0.0.12b1/src/risk/annotations/nltk_setup.py +86 -0
  8. risk_network-0.0.12b1/src/risk/log/__init__.py +11 -0
  9. risk_network-0.0.12b1/src/risk/log/console.py +141 -0
  10. risk_network-0.0.12b1/src/risk/log/parameters.py +171 -0
  11. risk_network-0.0.12b1/src/risk/neighborhoods/__init__.py +7 -0
  12. risk_network-0.0.12b1/src/risk/neighborhoods/api.py +442 -0
  13. risk_network-0.0.12b1/src/risk/neighborhoods/community.py +441 -0
  14. risk_network-0.0.12b1/src/risk/neighborhoods/domains.py +360 -0
  15. risk_network-0.0.12b1/src/risk/neighborhoods/neighborhoods.py +514 -0
  16. risk_network-0.0.12b1/src/risk/neighborhoods/stats/__init__.py +13 -0
  17. risk_network-0.0.12b1/src/risk/neighborhoods/stats/permutation/__init__.py +6 -0
  18. risk_network-0.0.12b1/src/risk/neighborhoods/stats/permutation/permutation.py +240 -0
  19. risk_network-0.0.12b1/src/risk/neighborhoods/stats/permutation/test_functions.py +70 -0
  20. risk_network-0.0.12b1/src/risk/neighborhoods/stats/tests.py +275 -0
  21. risk_network-0.0.12b1/src/risk/network/__init__.py +4 -0
  22. risk_network-0.0.12b1/src/risk/network/graph/__init__.py +4 -0
  23. risk_network-0.0.12b1/src/risk/network/graph/api.py +200 -0
  24. risk_network-0.0.12b1/src/risk/network/graph/graph.py +268 -0
  25. risk_network-0.0.12b1/src/risk/network/graph/stats.py +166 -0
  26. risk_network-0.0.12b1/src/risk/network/graph/summary.py +253 -0
  27. risk_network-0.0.12b1/src/risk/network/io.py +693 -0
  28. risk_network-0.0.12b1/src/risk/network/plotter/__init__.py +4 -0
  29. risk_network-0.0.12b1/src/risk/network/plotter/api.py +54 -0
  30. risk_network-0.0.12b1/src/risk/network/plotter/canvas.py +291 -0
  31. risk_network-0.0.12b1/src/risk/network/plotter/contour.py +329 -0
  32. risk_network-0.0.12b1/src/risk/network/plotter/labels.py +935 -0
  33. risk_network-0.0.12b1/src/risk/network/plotter/network.py +294 -0
  34. risk_network-0.0.12b1/src/risk/network/plotter/plotter.py +141 -0
  35. risk_network-0.0.12b1/src/risk/network/plotter/utils/colors.py +419 -0
  36. risk_network-0.0.12b1/src/risk/network/plotter/utils/layout.py +94 -0
  37. risk_network-0.0.12b1/src/risk_network.egg-info/PKG-INFO +122 -0
  38. risk_network-0.0.12b1/src/risk_network.egg-info/SOURCES.txt +50 -0
  39. risk_network-0.0.12b1/tests/test_load_annotations.py +291 -0
  40. risk_network-0.0.12b1/tests/test_load_graph.py +426 -0
  41. risk_network-0.0.12b1/tests/test_load_io_combinations.py +95 -0
  42. risk_network-0.0.12b1/tests/test_load_neighborhoods.py +455 -0
  43. risk_network-0.0.12b1/tests/test_load_network.py +401 -0
  44. risk_network-0.0.12b1/tests/test_load_plotter.py +1483 -0
  45. risk_network-0.0.12b1/tests/test_log.py +72 -0
  46. risk_network-0.0.12b0/MANIFEST.in +0 -20
  47. risk_network-0.0.12b0/PKG-INFO +0 -796
  48. risk_network-0.0.12b0/src/risk_network.egg-info/PKG-INFO +0 -796
  49. risk_network-0.0.12b0/src/risk_network.egg-info/SOURCES.txt +0 -11
  50. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/LICENSE +0 -0
  51. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/README.md +0 -0
  52. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/setup.cfg +0 -0
  53. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk/risk.py +0 -0
  54. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk_network.egg-info/dependency_links.txt +0 -0
  55. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk_network.egg-info/requires.txt +0 -0
  56. {risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk_network.egg-info/top_level.txt +0 -0
risk_network-0.0.12b1/PKG-INFO
@@ -0,0 +1,122 @@
+ Metadata-Version: 2.4
+ Name: risk-network
+ Version: 0.0.12b1
+ Summary: A Python package for biological network analysis
+ Author-email: Ira Horecka <ira89@icloud.com>
+ License: GPL-3.0-or-later
+ Project-URL: Homepage, https://github.com/riskportal/network
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Scientific/Engineering :: Visualization
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 4 - Beta
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: ipywidgets
+ Requires-Dist: leidenalg
+ Requires-Dist: markov_clustering
+ Requires-Dist: matplotlib
+ Requires-Dist: networkx
+ Requires-Dist: nltk
+ Requires-Dist: numpy
+ Requires-Dist: openpyxl
+ Requires-Dist: pandas
+ Requires-Dist: python-igraph
+ Requires-Dist: python-louvain
+ Requires-Dist: scikit-learn
+ Requires-Dist: scipy
+ Requires-Dist: statsmodels
+ Requires-Dist: threadpoolctl
+ Requires-Dist: tqdm
+ Dynamic: license-file
+
+ # RISK Network
+
+ <p align="center">
+ <img src="https://i.imgur.com/8TleEJs.png" width="50%" />
+ </p>
+
+ <br>
+
+ ![Python](https://img.shields.io/badge/python-3.8%2B-yellow)
+ [![pypiv](https://img.shields.io/pypi/v/risk-network.svg)](https://pypi.python.org/pypi/risk-network)
+ ![License](https://img.shields.io/badge/license-GPLv3-purple)
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.xxxxxxx.svg)](https://doi.org/10.5281/zenodo.xxxxxxx)
+ ![Downloads](https://img.shields.io/pypi/dm/risk-network)
+ ![Tests](https://github.com/riskportal/network/actions/workflows/ci.yml/badge.svg)
+
+ **RISK** (Regional Inference of Significant Kinships) is a next-generation tool for biological network annotation and visualization. RISK integrates community detection-based clustering, rigorous statistical enrichment analysis, and a modular framework to uncover biologically meaningful relationships and generate high-resolution visualizations. RISK supports diverse data formats and is optimized for large-scale network analysis, making it a valuable resource for researchers in systems biology and beyond.
+
+ ## Documentation and Tutorial
+
+ Full documentation is available at:
+
+ - **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+ - **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)
+
+ ## Installation
+
+ RISK is compatible with Python 3.8 or later and runs on all major operating systems. To install the latest version of RISK, run:
+
+ ```bash
+ pip install risk-network --upgrade
+ ```
+
+ ## Features
+
+ - **Comprehensive Network Analysis**: Analyze biological networks (e.g., protein–protein interaction and genetic interaction networks) as well as non-biological networks.
+ - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
+ - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
+ - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+ - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
+
+ ## Example Usage
+
+ We applied RISK to a *Saccharomyces cerevisiae* protein–protein interaction network from Michaelis et al. (2023), filtering for proteins with six or more interactions to emphasize core functional relationships. RISK identified compact, statistically enriched clusters corresponding to biological processes such as ribosomal assembly and mitochondrial organization.
+
+ [![Figure 1](https://i.imgur.com/lJHJrJr.jpeg)](https://i.imgur.com/lJHJrJr.jpeg)
+
+ This figure highlights RISK’s capability to detect both established and novel functional modules within the yeast interactome.
+
+ ## Citation
+
+ If you use RISK in your research, please cite:
+
+ **Horecka et al.**, "RISK: a next-generation tool for biological network annotation and visualization", **Bioinformatics**, 2025. DOI: [10.1234/zenodo.xxxxxxx](https://doi.org/10.1234/zenodo.xxxxxxx)
+
+ ## Software Architecture and Implementation
+
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:
+
+ - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
+ - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
+ - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
+ - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+ - **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.
+
+ ## Performance and Efficiency
+
+ Benchmarking results demonstrate that RISK efficiently scales to networks exceeding hundreds of thousands of edges, maintaining low execution times and optimal memory usage across statistical tests.
+
+ ## Contributing
+
+ We welcome contributions from the community:
+
+ - [Issues Tracker](https://github.com/riskportal/network/issues)
+ - [Source Code](https://github.com/riskportal/network/tree/main/risk)
+
+ ## Support
+
+ If you encounter issues or have suggestions for new features, please use the [Issues Tracker](https://github.com/riskportal/network/issues) on GitHub.
+
+ ## License
+
+ RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
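The metadata above can be cross-checked against an installed copy of the release. Below is a minimal sketch, assuming risk-network 0.0.12b1 has been installed from PyPI as described in the README; it relies only on names visible elsewhere in this diff (`risk.__version__` and the `RISK` class re-exported by `src/risk/__init__.py`), and importing the package may trigger the NLTK resource setup introduced in `risk/annotations`.

```python
# Hedged sanity check against an installed copy of the 0.0.12b1 release.
import risk
from risk import RISK  # re-exported from risk.risk in src/risk/__init__.py

print(risk.__version__)  # "0.0.12-beta.1" in this release
print(RISK)              # the top-level analysis class; its constructor is not shown in this diff
```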
{risk_network-0.0.12b0 → risk_network-0.0.12b1}/pyproject.toml
@@ -1,20 +1,19 @@
  [build-system]
- requires = ["setuptools >= 77.0.3", "numpy"]
+ requires = ["setuptools", "numpy"]
  build-backend = "setuptools.build_meta"

  [project]
  name = "risk-network"
- dynamic = ["version"] # Indicates that version is determined dynamically
+ dynamic = ["version"]
  description = "A Python package for biological network analysis"
  authors = [
      { name = "Ira Horecka", email = "ira89@icloud.com" },
  ]
  readme = "README.md"
- license = { file = "LICENSE" }
+ requires-python = ">=3.8"
  classifiers = [
      "Intended Audience :: Developers",
      "Intended Audience :: Science/Research",
-     "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
      "Operating System :: OS Independent",
      "Programming Language :: Python :: 3",
      "Programming Language :: Python :: 3.8",
@@ -43,11 +42,18 @@ dependencies = [
      "threadpoolctl",
      "tqdm",
  ]
- requires-python = ">=3.8"
+
+ [project.license]
+ text = "GPL-3.0-or-later"
+
+ [project.urls]
+ "Homepage" = "https://github.com/riskportal/network"

  [tool.setuptools]
  package-dir = {"" = "src"}
- packages = ["risk"]
+
+ [tool.setuptools.packages.find]
+ where = ["src"]

  [tool.setuptools.dynamic]
  version = { attr = "risk.__version__" }
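The build changes above move the license to a `[project.license]` text expression, add a `Homepage` project URL, switch package discovery to `[tool.setuptools.packages.find]`, and keep the version dynamic via `attr = "risk.__version__"`. A minimal sketch of how the two version strings relate, assuming the package is installed: PEP 440 normalizes `0.0.12-beta.1` to `0.0.12b1`, which is why the distribution filename reads `0.0.12b1`.

```python
# Hedged check: distribution metadata vs. the module attribute it is derived from.
from importlib.metadata import version

import risk

print(version("risk-network"))  # "0.0.12b1" (PEP 440 normalization of "0.0.12-beta.1")
print(risk.__version__)         # "0.0.12-beta.1", as set in src/risk/__init__.py
```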
{risk_network-0.0.12b0 → risk_network-0.0.12b1}/src/risk/__init__.py
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships

  from risk.risk import RISK

- __version__ = "0.0.12-beta.0"
+ __version__ = "0.0.12-beta.1"
risk_network-0.0.12b1/src/risk/annotations/__init__.py
@@ -0,0 +1,10 @@
+ """
+ risk/annotations
+ ~~~~~~~~~~~~~~~~
+ """
+
+ from risk.annotations.annotations import (
+     define_top_annotations,
+     get_weighted_description,
+ )
+ from risk.annotations.io import AnnotationsIO
risk_network-0.0.12b1/src/risk/annotations/annotations.py
@@ -0,0 +1,354 @@
+ """
+ risk/annotations/annotations
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ """
+
+ import re
+ from collections import Counter
+ from itertools import compress
+ from typing import Any, Dict, List, Set
+
+ import networkx as nx
+ import numpy as np
+ import pandas as pd
+ from nltk.tokenize import word_tokenize
+ from scipy.sparse import coo_matrix
+
+ from risk.annotations.nltk_setup import setup_nltk_resources
+ from risk.log import logger
+
+
+ def initialize_nltk():
+     """Initialize all required NLTK components."""
+     setup_nltk_resources()
+
+     # After resources are available, initialize the components
+     from nltk.corpus import stopwords
+     from nltk.stem import WordNetLemmatizer
+
+     global STOP_WORDS, LEMMATIZER
+     STOP_WORDS = set(stopwords.words("english"))
+     LEMMATIZER = WordNetLemmatizer()
+
+
+ # Initialize NLTK components
+ initialize_nltk()
+
+
+ def load_annotations(
+     network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
+ ) -> Dict[str, Any]:
+     """Convert annotations input to a sparse matrix and reindex based on the network's node labels.
+
+     Args:
+         network (nx.Graph): The network graph.
+         annotations_input (Dict[str, Any]): A dictionary with annotations.
+         min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
+             term to be included. Defaults to 2.
+
+     Returns:
+         Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the sparse binary annotations
+             matrix.
+
+     Raises:
+         ValueError: If no annotations are found for the nodes in the network.
+         ValueError: If no annotations have at least min_nodes_per_term nodes in the network.
+     """
+     # Step 1: Map nodes and annotations to indices
+     node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
+     node_to_idx = {node: i for i, node in enumerate(node_label_order)}
+     annotation_to_idx = {annotation: i for i, annotation in enumerate(annotations_input)}
+     # Step 2: Construct a sparse binary matrix directly
+     row = []
+     col = []
+     data = []
+     for annotation, nodes in annotations_input.items():
+         for node in nodes:
+             if node in node_to_idx and annotation in annotation_to_idx:
+                 row.append(node_to_idx[node])
+                 col.append(annotation_to_idx[annotation])
+                 data.append(1)
+
+     # Create a sparse binary matrix
+     num_nodes = len(node_to_idx)
+     num_annotations = len(annotation_to_idx)
+     annotations_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotations)).tocsr()
+     # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
+     valid_annotations = annotations_pivot.sum(axis=0).A1 >= min_nodes_per_term
+     annotations_pivot = annotations_pivot[:, valid_annotations]
+     # Step 4: Raise errors for empty matrices
+     if annotations_pivot.nnz == 0:
+         raise ValueError("No terms found in the annotation file for the nodes in the network.")
+
+     num_remaining_annotations = annotations_pivot.shape[1]
+     if num_remaining_annotations == 0:
+         raise ValueError(
+             f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
+         )
+
+     # Step 5: Extract ordered nodes and annotations
+     ordered_nodes = tuple(node_label_order)
+     ordered_annotations = tuple(
+         annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotations) if is_valid
+     )
+
+     # Log the filtering details
+     logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
+     logger.info(f"Number of input annotation terms: {num_annotations}")
+     logger.info(f"Number of remaining annotation terms: {num_remaining_annotations}")
+
+     return {
+         "ordered_nodes": ordered_nodes,
+         "ordered_annotations": ordered_annotations,
+         "matrix": annotations_pivot,
+     }
+
+
+ def define_top_annotations(
+     network: nx.Graph,
+     ordered_annotation_labels: List[str],
+     neighborhood_significance_sums: List[int],
+     significant_significance_matrix: np.ndarray,
+     significant_binary_significance_matrix: np.ndarray,
+     min_cluster_size: int = 5,
+     max_cluster_size: int = 1000,
+ ) -> pd.DataFrame:
+     """Define top annotations based on neighborhood significance sums and binary significance matrix.
+
+     Args:
+         network (NetworkX graph): The network graph.
+         ordered_annotation_labels (list of str): List of ordered annotation labels.
+         neighborhood_significance_sums (list of int): List of neighborhood significance sums.
+         significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+         significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
+         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
+         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
+
+     Returns:
+         pd.DataFrame: DataFrame with top annotations and their properties.
+     """
+     # Sum the columns of the significant significance matrix (positive floating point values)
+     significant_significance_scores = significant_significance_matrix.sum(axis=0)
+     # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
+     annotations_significance_matrix = pd.DataFrame(
+         {
+             "id": range(len(ordered_annotation_labels)),
+             "full_terms": ordered_annotation_labels,
+             "significant_neighborhood_significance_sums": neighborhood_significance_sums,
+             "significant_significance_score": significant_significance_scores,
+         }
+     )
+     annotations_significance_matrix["significant_annotations"] = False
+     # Apply size constraints to identify potential significant annotations
+     annotations_significance_matrix.loc[
+         (
+             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             >= min_cluster_size
+         )
+         & (
+             annotations_significance_matrix["significant_neighborhood_significance_sums"]
+             <= max_cluster_size
+         ),
+         "significant_annotations",
+     ] = True
+     # Initialize columns for connected components analysis
+     annotations_significance_matrix["num_connected_components"] = 0
+     annotations_significance_matrix["size_connected_components"] = None
+     annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
+         "size_connected_components"
+     ].astype(object)
+     annotations_significance_matrix["num_large_connected_components"] = 0
+
+     for attribute in annotations_significance_matrix.index.values[
+         annotations_significance_matrix["significant_annotations"]
+     ]:
+         # Identify significant neighborhoods based on the binary significance matrix
+         significant_neighborhoods = list(
+             compress(list(network), significant_binary_significance_matrix[:, attribute])
+         )
+         significant_network = nx.subgraph(network, significant_neighborhoods)
+         # Analyze connected components within the significant subnetwork
+         connected_components = sorted(
+             nx.connected_components(significant_network), key=len, reverse=True
+         )
+         size_connected_components = np.array([len(c) for c in connected_components])
+
+         # Filter the size of connected components by min_cluster_size and max_cluster_size
+         filtered_size_connected_components = size_connected_components[
+             (size_connected_components >= min_cluster_size)
+             & (size_connected_components <= max_cluster_size)
+         ]
+         # Calculate the number of connected components and large connected components
+         num_connected_components = len(connected_components)
+         num_large_connected_components = len(filtered_size_connected_components)
+
+         # Assign the number of connected components
+         annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
+             num_connected_components
+         )
+         # Filter out attributes with more than one connected component
+         annotations_significance_matrix.loc[
+             annotations_significance_matrix["num_connected_components"] > 1,
+             "significant_annotations",
+         ] = False
+         # Assign the number of large connected components
+         annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
+             num_large_connected_components
+         )
+         # Assign the size of connected components, ensuring it is always a list
+         annotations_significance_matrix.at[attribute, "size_connected_components"] = (
+             filtered_size_connected_components.tolist()
+         )
+
+     return annotations_significance_matrix
+
+
+ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
+     """Generate a weighted description from words and their corresponding scores,
+     using improved weighting logic with normalization, lemmatization, and aggregation.
+
+     Args:
+         words_column (pd.Series): A pandas Series containing strings (phrases) to process.
+         scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.
+
+     Returns:
+         str: A coherent description formed from the most frequent and significant words.
+     """
+     # Normalize significance scores to [0,1]. If all scores are identical, use 1.
+     if scores_column.max() == scores_column.min():
+         normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
+     else:
+         normalized_scores = (scores_column - scores_column.min()) / (
+             scores_column.max() - scores_column.min()
+         )
+
+     # Accumulate weighted counts for each token (after cleaning and lemmatization)
+     weighted_counts = {}
+     for phrase, score in zip(words_column, normalized_scores):
+         # Tokenize the phrase
+         tokens = word_tokenize(str(phrase))
+         # Determine the weight (scale factor; here multiplying normalized score by 10)
+         weight = max(1, int((0 if pd.isna(score) else score) * 10))
+         for token in tokens:
+             # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
+             token_clean = re.sub(r"[^\w\-]", "", token).strip()
+             if not token_clean:
+                 continue
+             # Skip tokens that are pure numbers
+             if token_clean.isdigit():
+                 continue
+             # Skip stopwords
+             if token_clean in STOP_WORDS:
+                 continue
+             # Lemmatize the token to merge similar forms
+             token_norm = LEMMATIZER.lemmatize(token_clean)
+             weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
+
+     # Reconstruct a weighted token list by repeating each token by its aggregated count.
+     weighted_words = []
+     for token, count in weighted_counts.items():
+         weighted_words.extend([token] * count)
+
+     # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
+     combined_tokens = []
+     for token in weighted_words:
+         if re.match(r"^\d+-\w+", token):
+             combined_tokens.append(token)
+         elif token.replace(".", "", 1).isdigit():
+             continue
+         else:
+             combined_tokens.append(token)
+
+     # If the only token is numeric, return a default value.
+     if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
+         return "N/A"
+
+     # Simplify the token list to remove near-duplicates based on the Jaccard index.
+     simplified_words = _simplify_word_list(combined_tokens)
+     # Generate a coherent description from the simplified words.
+     description = _generate_coherent_description(simplified_words)
+
+     return description
+
+
+ def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
+     """Filter out words that are too similar based on the Jaccard index,
+     keeping the word with the higher aggregated count.
+
+     Args:
+         words (List[str]): The list of tokens to be filtered.
+         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.
+
+     Returns:
+         List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
+     """
+     # Count the occurrences (which reflect the weighted importance)
+     word_counts = Counter(words)
+     filtered_words = []
+     used_words = set()
+
+     # Iterate through words sorted by descending weighted frequency
+     for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
+         if word in used_words:
+             continue
+
+         word_set = set(word)
+         # Find similar words (including the current word) based on the Jaccard index
+         similar_words = [
+             other_word
+             for other_word in word_counts
+             if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
+         ]
+         # Choose the word with the highest weighted count among the similar group
+         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
+         best_word = similar_words[0]
+         filtered_words.append(best_word)
+         used_words.update(similar_words)
+
+     # Preserve the original order (by frequency) from the filtered set
+     final_words = [word for word in words if word in filtered_words]
+
+     return final_words
+
+
+ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
+     """Calculate the Jaccard index between two sets.
+
+     Args:
+         set1 (Set[Any]): The first set.
+         set2 (Set[Any]): The second set.
+
+     Returns:
+         float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
+     """
+     intersection = len(set1.intersection(set2))
+     union = len(set1.union(set2))
+     return intersection / union if union else 0
+
+
+ def _generate_coherent_description(words: List[str]) -> str:
+     """Generate a coherent description from a list of words.
+
+     If there is only one unique entry, return it directly.
+     Otherwise, order the words by frequency and join them into a single string.
+
+     Args:
+         words (List[str]): A list of tokens.
+
+     Returns:
+         str: A coherent, space-separated description.
+     """
+     if not words:
+         return "N/A"
+
+     # If there is only one unique word, return it directly
+     unique_words = set(words)
+     if len(unique_words) == 1:
+         return list(unique_words)[0]
+
+     # Count weighted occurrences and sort in descending order.
+     word_counts = Counter(words)
+     most_common_words = [word for word, _ in word_counts.most_common()]
+     description = " ".join(most_common_words)
+
+     return description
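For orientation, here is a minimal usage sketch of the new annotations helpers. The toy graph, node labels, and annotation terms are hypothetical and exist only for illustration; the sketch relies solely on the signatures shown in the diff above, and it assumes the NLTK resources fetched by `setup_nltk_resources()` at import time are available.

```python
# Hedged sketch: exercising load_annotations and get_weighted_description
# from risk/annotations/annotations.py with made-up data.
import networkx as nx
import pandas as pd

from risk.annotations.annotations import get_weighted_description, load_annotations

# load_annotations reads the "label" attribute of each node.
network = nx.Graph()
network.add_nodes_from(
    [(0, {"label": "YAL001C"}), (1, {"label": "YAL002W"}), (2, {"label": "YAL003W"})]
)
network.add_edges_from([(0, 1), (1, 2)])

# Each annotation term maps to the node labels it covers.
annotations_input = {
    "ribosome biogenesis": ["YAL001C", "YAL003W"],
    "singleton term": ["YAL002W"],  # dropped: fewer than min_nodes_per_term nodes
}

result = load_annotations(network, annotations_input, min_nodes_per_term=2)
print(result["ordered_annotations"])  # ("ribosome biogenesis",)
print(result["matrix"].shape)         # (3, 1) sparse binary node-by-term matrix

# get_weighted_description condenses phrases, weighting tokens by significance scores.
phrases = pd.Series(["ribosome biogenesis", "ribosomal subunit assembly"])
scores = pd.Series([3.2, 1.1])
print(get_weighted_description(phrases, scores))
```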