risk-network 0.0.10__py3-none-any.whl → 0.0.12b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. risk/__init__.py +1 -1
  2. risk/risk.py +5 -6
  3. {risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info}/METADATA +11 -13
  4. risk_network-0.0.12b0.dist-info/RECORD +7 -0
  5. {risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info}/WHEEL +1 -1
  6. risk/annotations/__init__.py +0 -7
  7. risk/annotations/annotations.py +0 -394
  8. risk/annotations/io.py +0 -240
  9. risk/log/__init__.py +0 -11
  10. risk/log/console.py +0 -141
  11. risk/log/parameters.py +0 -172
  12. risk/neighborhoods/__init__.py +0 -8
  13. risk/neighborhoods/api.py +0 -442
  14. risk/neighborhoods/community.py +0 -412
  15. risk/neighborhoods/domains.py +0 -358
  16. risk/neighborhoods/neighborhoods.py +0 -508
  17. risk/network/__init__.py +0 -6
  18. risk/network/geometry.py +0 -150
  19. risk/network/graph/__init__.py +0 -6
  20. risk/network/graph/api.py +0 -200
  21. risk/network/graph/graph.py +0 -269
  22. risk/network/graph/summary.py +0 -254
  23. risk/network/io.py +0 -550
  24. risk/network/plotter/__init__.py +0 -6
  25. risk/network/plotter/api.py +0 -54
  26. risk/network/plotter/canvas.py +0 -291
  27. risk/network/plotter/contour.py +0 -330
  28. risk/network/plotter/labels.py +0 -924
  29. risk/network/plotter/network.py +0 -294
  30. risk/network/plotter/plotter.py +0 -143
  31. risk/network/plotter/utils/colors.py +0 -416
  32. risk/network/plotter/utils/layout.py +0 -94
  33. risk/stats/__init__.py +0 -15
  34. risk/stats/permutation/__init__.py +0 -6
  35. risk/stats/permutation/permutation.py +0 -237
  36. risk/stats/permutation/test_functions.py +0 -69
  37. risk/stats/significance.py +0 -166
  38. risk/stats/stat_tests.py +0 -267
  39. risk_network-0.0.10.dist-info/RECORD +0 -40
  40. {risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info/licenses}/LICENSE +0 -0
  41. {risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info}/top_level.txt +0 -0
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
 
  from risk.risk import RISK
 
- __version__ = "0.0.10"
+ __version__ = "0.0.12-beta.0"
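Note that the two spellings of the new version refer to the same release: PEP 440 normalizes the in-code string "0.0.12-beta.0" to the canonical form "0.0.12b0" used in the wheel filename and METADATA. A quick check with the `packaging` library (not part of this diff, shown only for illustration):

```python
from packaging.version import Version

# PEP 440 normalization: "-beta.0" collapses to the canonical pre-release "b0".
print(Version("0.0.12-beta.0"))                         # 0.0.12b0
print(Version("0.0.12-beta.0") == Version("0.0.12b0"))  # True
```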
risk/risk.py CHANGED
@@ -3,13 +3,12 @@ risk/risk
  ~~~~~~~~~
  """
 
- from risk.network import NetworkIO
- from risk.annotations import AnnotationsIO
- from risk.neighborhoods import NeighborhoodsAPI
- from risk.network.graph import GraphAPI
- from risk.network.plotter import PlotterAPI
-
+ from risk.annotations.io import AnnotationsIO
  from risk.log import params, set_global_verbosity
+ from risk.neighborhoods.api import NeighborhoodsAPI
+ from risk.network.graph.api import GraphAPI
+ from risk.network.io import NetworkIO
+ from risk.network.plotter.api import PlotterAPI
 
 
  class RISK(NetworkIO, AnnotationsIO, NeighborhoodsAPI, GraphAPI, PlotterAPI):
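The change above only swaps subpackage re-exports for direct module imports; the RISK class itself remains a plain multiple-inheritance facade, so every public method of the five parent classes is exposed on a single object. A toy sketch of the same pattern (hypothetical mixins and methods, not the actual RISK API):

```python
# Toy illustration of the facade-by-mixin pattern seen in the class line above.
class LoaderMixin:
    def load(self, path: str) -> str:
        return f"loaded {path}"


class PlotterMixin:
    def plot(self, data: str) -> str:
        return f"plotted {data}"


class Facade(LoaderMixin, PlotterMixin):
    """Single entry point that inherits every public method of its mixins."""


facade = Facade()
print(facade.plot(facade.load("network.json")))  # plotted loaded network.json
```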
{risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info}/METADATA CHANGED
@@ -1,8 +1,7 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: risk-network
- Version: 0.0.10
+ Version: 0.0.12b0
  Summary: A Python package for biological network analysis
- Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -699,7 +698,7 @@ Requires-Dist: leidenalg
  Requires-Dist: markov_clustering
  Requires-Dist: matplotlib
  Requires-Dist: networkx
- Requires-Dist: nltk==3.8.1
+ Requires-Dist: nltk
  Requires-Dist: numpy
  Requires-Dist: openpyxl
  Requires-Dist: pandas
@@ -710,8 +709,7 @@ Requires-Dist: scipy
  Requires-Dist: statsmodels
  Requires-Dist: threadpoolctl
  Requires-Dist: tqdm
- Dynamic: author
- Dynamic: requires-python
+ Dynamic: license-file
 
  # RISK Network
 
@@ -732,7 +730,10 @@ Dynamic: requires-python
 
  ## Documentation and Tutorial
 
- An interactive Jupyter notebook tutorial can be found [here](https://github.com/riskportal/network-tutorial). We highly recommend new users to consult the documentation and tutorial early on to fully utilize RISK's capabilities.
+ Full documentation is available at:
+
+ - **Docs:** [https://riskportal.github.io/network-tutorial](https://riskportal.github.io/network-tutorial)
+ - **Tutorial Jupyter Notebook Repository:** [https://github.com/riskportal/network-tutorial](https://github.com/riskportal/network-tutorial)
 
  ## Installation
 
@@ -748,7 +749,7 @@ pip install risk-network --upgrade
  - **Advanced Clustering Algorithms**: Supports Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap for identifying structured network regions.
  - **Flexible Visualization**: Produce customizable, high-resolution network visualizations with kernel density estimate overlays, adjustable node and edge attributes, and export options in SVG, PNG, and PDF formats.
  - **Efficient Data Handling**: Supports multiple input/output formats, including JSON, CSV, TSV, Excel, Cytoscape, and GPickle.
- - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
+ - **Statistical Analysis**: Assess functional enrichment using hypergeometric, permutation (network-aware), binomial, chi-squared, Poisson, and z-score tests, ensuring statistical adaptability across datasets.
  - **Cross-Domain Applicability**: Suitable for network analysis across biological and non-biological domains, including social and communication networks.
 
  ## Example Usage
@@ -767,12 +768,13 @@ If you use RISK in your research, please cite:
 
  ## Software Architecture and Implementation
 
- RISK features a streamlined, modular architecture designed to meet diverse research needs. It includes dedicated modules for:
+ RISK features a streamlined, modular architecture designed to meet diverse research needs. RISK’s modular design enables users to run individual components—such as clustering, statistical testing, or visualization—independently or in combination, depending on the analysis workflow. It includes dedicated modules for:
 
  - **Data I/O**: Supports JSON, CSV, TSV, Excel, Cytoscape, and GPickle formats.
  - **Clustering**: Supports multiple clustering methods, including Louvain, Leiden, Markov Clustering, Greedy Modularity, Label Propagation, Spinglass, and Walktrap. Provides flexible distance metrics tailored to network structure.
  - **Statistical Analysis**: Provides a suite of tests for overrepresentation analysis of annotations.
  - **Visualization**: Offers customizable, high-resolution output in multiple formats, including SVG, PNG, and PDF.
+ - **Configuration Management**: Centralized parameters in risk.params ensure reproducibility and easy tuning for large-scale analyses.
 
  ## Performance and Efficiency
 
@@ -792,7 +794,3 @@ If you encounter issues or have suggestions for new features, please use the [Is
  ## License
 
  RISK is open source under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html).
-
- ---
-
- **Note**: For detailed documentation and to access the interactive tutorial, please visit the links above.
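The METADATA feature list above names hypergeometric, permutation, binomial, chi-squared, Poisson, and z-score tests. As a point of reference only, and not RISK's own implementation from risk/stats, a generic hypergeometric overrepresentation p-value for one neighborhood and one annotation term can be computed with SciPy like this (all counts below are hypothetical):

```python
from scipy.stats import hypergeom

# Hypothetical counts: N nodes in the network, K of them carry the annotation,
# n nodes fall in the neighborhood, k of those n are annotated.
N, K, n, k = 1000, 40, 25, 5
# Survival function at k - 1 gives P(X >= k), the overrepresentation p-value.
p_value = hypergeom.sf(k - 1, N, K, n)
print(f"{p_value:.4g}")
```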
risk_network-0.0.12b0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ risk/__init__.py,sha256=e_WD-ImEb9HlOwCwNwl4j-NsOz0aX5quhrIlzDfXUUo,127
+ risk/risk.py,sha256=_Zs8cC4V0eqzfaMbq9M50ir815dbYS-oyTPlrySuMLw,1121
+ risk_network-0.0.12b0.dist-info/licenses/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
+ risk_network-0.0.12b0.dist-info/METADATA,sha256=kLPB0KWqefUPdVh8NBszDvNaVimnq7dK75mtxIxUsls,47216
+ risk_network-0.0.12b0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ risk_network-0.0.12b0.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
+ risk_network-0.0.12b0.dist-info/RECORD,,
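Each new RECORD entry is a comma-separated triple of path, `sha256=` digest, and file size in bytes, with the digest encoded as unpadded URL-safe base64 per the wheel specification. A minimal sketch of how such a digest is produced (standard library only; the example path is illustrative):

```python
import base64
import hashlib

def record_digest(path: str) -> str:
    """Return the RECORD-style digest: 'sha256=' + unpadded urlsafe base64."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_digest("risk/__init__.py"), run against the installed 0.0.12b0
# wheel contents, should reproduce the value listed above.
```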
{risk_network-0.0.10.dist-info → risk_network-0.0.12b0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.0.0)
+ Generator: setuptools (78.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
risk/annotations/__init__.py DELETED
@@ -1,7 +0,0 @@
- """
- risk/annotations
- ~~~~~~~~~~~~~~~~
- """
-
- from risk.annotations.annotations import define_top_annotations, get_weighted_description
- from risk.annotations.io import AnnotationsIO
risk/annotations/annotations.py DELETED
@@ -1,394 +0,0 @@
- """
- risk/annotations/annotations
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- """
-
- import os
- import re
- import zipfile
- from collections import Counter
- from itertools import compress
- from typing import Any, Dict, List, Set
-
- import networkx as nx
- import nltk
- import numpy as np
- import pandas as pd
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from nltk.tokenize import word_tokenize
-
- from risk.log import logger
- from scipy.sparse import coo_matrix
-
-
- def ensure_nltk_resource(resource: str) -> None:
-     """Ensure the specified NLTK resource is available."""
-     # Define the path to the resource within the NLTK data directory
-     resource_path = f"corpora/{resource}"
-     # Check if the resource is already available.
-     try:
-         nltk.data.find(resource_path)
-         return
-     except LookupError:
-         print(f"Resource '{resource}' not found. Attempting to download...")
-
-     # Download the resource.
-     nltk.download(resource)
-     # Check again after downloading.
-     try:
-         nltk.data.find(resource_path)
-         return
-     except LookupError:
-         print(f"Resource '{resource}' still not found after download. Checking for a ZIP file...")
-
-     # Look for a ZIP file in all known NLTK data directories.
-     for data_path in nltk.data.path:
-         zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
-         if os.path.isfile(zip_path):
-             print(f"Found ZIP file for '{resource}' at: {zip_path}")
-             target_dir = os.path.join(data_path, "corpora")
-             with zipfile.ZipFile(zip_path, "r") as z:
-                 z.extractall(path=target_dir)
-             print(f"Unzipped '{resource}' successfully.")
-             break # Stop after unzipping the first found ZIP.
-
-     # Final check: Try to check resource one last time. If it fails, rai
-     try:
-         nltk.data.find(resource_path)
-         print(f"Resource '{resource}' is now available.")
-     except LookupError:
-         raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
-
-
- # Ensure the NLTK stopwords and WordNet resources are available
- # punkt is known to have issues with the default download method, so we use a custom function if it fails
- try:
-     ensure_nltk_resource("punkt")
- except LookupError:
-     nltk.download("punkt")
- ensure_nltk_resource("stopwords")
- ensure_nltk_resource("wordnet")
- # Use NLTK's stopwords - load all languages
- STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
- # Initialize the WordNet lemmatizer, which is used for normalizing words
- LEMMATIZER = WordNetLemmatizer()
-
-
- def load_annotations(
-     network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
- ) -> Dict[str, Any]:
-     """Convert annotations input to a sparse matrix and reindex based on the network's node labels.
-
-     Args:
-         network (nx.Graph): The network graph.
-         annotations_input (Dict[str, Any]): A dictionary with annotations.
-         min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
-             term to be included. Defaults to 2.
-
-     Returns:
-         Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the sparse binary annotations
-             matrix.
-
-     Raises:
-         ValueError: If no annotations are found for the nodes in the network.
-         ValueError: If no annotations have at least min_nodes_per_term nodes in the network.
-     """
-     # Step 1: Map nodes and annotations to indices
-     node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
-     node_to_idx = {node: i for i, node in enumerate(node_label_order)}
-     annotation_to_idx = {annotation: i for i, annotation in enumerate(annotations_input)}
-     # Step 2: Construct a sparse binary matrix directly
-     row = []
-     col = []
-     data = []
-     for annotation, nodes in annotations_input.items():
-         for node in nodes:
-             if node in node_to_idx and annotation in annotation_to_idx:
-                 row.append(node_to_idx[node])
-                 col.append(annotation_to_idx[annotation])
-                 data.append(1)
-
-     # Create a sparse binary matrix
-     num_nodes = len(node_to_idx)
-     num_annotations = len(annotation_to_idx)
-     annotations_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotations)).tocsr()
-     # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
-     valid_annotations = annotations_pivot.sum(axis=0).A1 >= min_nodes_per_term
-     annotations_pivot = annotations_pivot[:, valid_annotations]
-     # Step 4: Raise errors for empty matrices
-     if annotations_pivot.nnz == 0:
-         raise ValueError("No terms found in the annotation file for the nodes in the network.")
-
-     num_remaining_annotations = annotations_pivot.shape[1]
-     if num_remaining_annotations == 0:
-         raise ValueError(
-             f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
-         )
-
-     # Step 5: Extract ordered nodes and annotations
-     ordered_nodes = tuple(node_label_order)
-     ordered_annotations = tuple(
-         annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotations) if is_valid
-     )
-
-     # Log the filtering details
-     logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
-     logger.info(f"Number of input annotation terms: {num_annotations}")
-     logger.info(f"Number of remaining annotation terms: {num_remaining_annotations}")
-
-     return {
-         "ordered_nodes": ordered_nodes,
-         "ordered_annotations": ordered_annotations,
-         "matrix": annotations_pivot,
-     }
-
-
- def define_top_annotations(
-     network: nx.Graph,
-     ordered_annotation_labels: List[str],
-     neighborhood_significance_sums: List[int],
-     significant_significance_matrix: np.ndarray,
-     significant_binary_significance_matrix: np.ndarray,
-     min_cluster_size: int = 5,
-     max_cluster_size: int = 1000,
- ) -> pd.DataFrame:
-     """Define top annotations based on neighborhood significance sums and binary significance matrix.
-
-     Args:
-         network (NetworkX graph): The network graph.
-         ordered_annotation_labels (list of str): List of ordered annotation labels.
-         neighborhood_significance_sums (list of int): List of neighborhood significance sums.
-         significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
-         significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
-         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
-         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
-
-     Returns:
-         pd.DataFrame: DataFrame with top annotations and their properties.
-     """
-     # Sum the columns of the significant significance matrix (positive floating point values)
-     significant_significance_scores = significant_significance_matrix.sum(axis=0)
-     # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
-     annotations_significance_matrix = pd.DataFrame(
-         {
-             "id": range(len(ordered_annotation_labels)),
-             "full_terms": ordered_annotation_labels,
-             "significant_neighborhood_significance_sums": neighborhood_significance_sums,
-             "significant_significance_score": significant_significance_scores,
-         }
-     )
-     annotations_significance_matrix["significant_annotations"] = False
-     # Apply size constraints to identify potential significant annotations
-     annotations_significance_matrix.loc[
-         (
-             annotations_significance_matrix["significant_neighborhood_significance_sums"]
-             >= min_cluster_size
-         )
-         & (
-             annotations_significance_matrix["significant_neighborhood_significance_sums"]
-             <= max_cluster_size
-         ),
-         "significant_annotations",
-     ] = True
-     # Initialize columns for connected components analysis
-     annotations_significance_matrix["num_connected_components"] = 0
-     annotations_significance_matrix["size_connected_components"] = None
-     annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
-         "size_connected_components"
-     ].astype(object)
-     annotations_significance_matrix["num_large_connected_components"] = 0
-
-     for attribute in annotations_significance_matrix.index.values[
-         annotations_significance_matrix["significant_annotations"]
-     ]:
-         # Identify significant neighborhoods based on the binary significance matrix
-         significant_neighborhoods = list(
-             compress(list(network), significant_binary_significance_matrix[:, attribute])
-         )
-         significant_network = nx.subgraph(network, significant_neighborhoods)
-         # Analyze connected components within the significant subnetwork
-         connected_components = sorted(
-             nx.connected_components(significant_network), key=len, reverse=True
-         )
-         size_connected_components = np.array([len(c) for c in connected_components])
-
-         # Filter the size of connected components by min_cluster_size and max_cluster_size
-         filtered_size_connected_components = size_connected_components[
-             (size_connected_components >= min_cluster_size)
-             & (size_connected_components <= max_cluster_size)
-         ]
-         # Calculate the number of connected components and large connected components
-         num_connected_components = len(connected_components)
-         num_large_connected_components = len(filtered_size_connected_components)
-
-         # Assign the number of connected components
-         annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
-             num_connected_components
-         )
-         # Filter out attributes with more than one connected component
-         annotations_significance_matrix.loc[
-             annotations_significance_matrix["num_connected_components"] > 1,
-             "significant_annotations",
-         ] = False
-         # Assign the number of large connected components
-         annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
-             num_large_connected_components
-         )
-         # Assign the size of connected components, ensuring it is always a list
-         annotations_significance_matrix.at[attribute, "size_connected_components"] = (
-             filtered_size_connected_components.tolist()
-         )
-
-     return annotations_significance_matrix
-
-
- def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
-     """Generate a weighted description from words and their corresponding scores,
-     using improved weighting logic with normalization, lemmatization, and aggregation.
-
-     Args:
-         words_column (pd.Series): A pandas Series containing strings (phrases) to process.
-         scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.
-
-     Returns:
-         str: A coherent description formed from the most frequent and significant words.
-     """
-     # Normalize significance scores to [0,1]. If all scores are identical, use 1.
-     if scores_column.max() == scores_column.min():
-         normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
-     else:
-         normalized_scores = (scores_column - scores_column.min()) / (
-             scores_column.max() - scores_column.min()
-         )
-
-     # Accumulate weighted counts for each token (after cleaning and lemmatization)
-     weighted_counts = {}
-     for phrase, score in zip(words_column, normalized_scores):
-         # Tokenize the phrase
-         tokens = word_tokenize(str(phrase))
-         # Determine the weight (scale factor; here multiplying normalized score by 10)
-         weight = max(1, int((0 if pd.isna(score) else score) * 10))
-         for token in tokens:
-             # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
-             token_clean = re.sub(r"[^\w\-]", "", token).strip()
-             if not token_clean:
-                 continue
-             # Skip tokens that are pure numbers
-             if token_clean.isdigit():
-                 continue
-             # Skip stopwords
-             if token_clean in STOP_WORDS:
-                 continue
-             # Lemmatize the token to merge similar forms
-             token_norm = LEMMATIZER.lemmatize(token_clean)
-             weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
-
-     # Reconstruct a weighted token list by repeating each token by its aggregated count.
-     weighted_words = []
-     for token, count in weighted_counts.items():
-         weighted_words.extend([token] * count)
-
-     # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
-     combined_tokens = []
-     for token in weighted_words:
-         if re.match(r"^\d+-\w+", token):
-             combined_tokens.append(token)
-         elif token.replace(".", "", 1).isdigit():
-             continue
-         else:
-             combined_tokens.append(token)
-
-     # If the only token is numeric, return a default value.
-     if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
-         return "N/A"
-
-     # Simplify the token list to remove near-duplicates based on the Jaccard index.
-     simplified_words = _simplify_word_list(combined_tokens)
-     # Generate a coherent description from the simplified words.
-     description = _generate_coherent_description(simplified_words)
-
-     return description
-
-
- def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
-     """Filter out words that are too similar based on the Jaccard index,
-     keeping the word with the higher aggregated count.
-
-     Args:
-         words (List[str]): The list of tokens to be filtered.
-         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.
-
-     Returns:
-         List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
-     """
-     # Count the occurrences (which reflect the weighted importance)
-     word_counts = Counter(words)
-     filtered_words = []
-     used_words = set()
-
-     # Iterate through words sorted by descending weighted frequency
-     for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
-         if word in used_words:
-             continue
-
-         word_set = set(word)
-         # Find similar words (including the current word) based on the Jaccard index
-         similar_words = [
-             other_word
-             for other_word in word_counts
-             if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
-         ]
-         # Choose the word with the highest weighted count among the similar group
-         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
-         best_word = similar_words[0]
-         filtered_words.append(best_word)
-         used_words.update(similar_words)
-
-     # Preserve the original order (by frequency) from the filtered set
-     final_words = [word for word in words if word in filtered_words]
-
-     return final_words
-
-
- def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
-     """Calculate the Jaccard index between two sets.
-
-     Args:
-         set1 (Set[Any]): The first set.
-         set2 (Set[Any]): The second set.
-
-     Returns:
-         float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
-     """
-     intersection = len(set1.intersection(set2))
-     union = len(set1.union(set2))
-     return intersection / union if union else 0
-
-
- def _generate_coherent_description(words: List[str]) -> str:
-     """Generate a coherent description from a list of words.
-
-     If there is only one unique entry, return it directly.
-     Otherwise, order the words by frequency and join them into a single string.
-
-     Args:
-         words (List[str]): A list of tokens.
-
-     Returns:
-         str: A coherent, space-separated description.
-     """
-     if not words:
-         return "N/A"
-
-     # If there is only one unique word, return it directly
-     unique_words = set(words)
-     if len(unique_words) == 1:
-         return list(unique_words)[0]
-
-     # Count weighted occurrences and sort in descending order.
-     word_counts = Counter(words)
-     most_common_words = [word for word, _ in word_counts.most_common()]
-     description = " ".join(most_common_words)
-
-     return description
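For reference, the `_simplify_word_list` helper above merges near-duplicate tokens by comparing their character sets with `_calculate_jaccard_index` against a 0.80 threshold. A standalone sketch of that comparison (illustrative values, not taken from the package):

```python
def jaccard_index(a: set, b: set) -> float:
    # Intersection over union; 0.0 when both sets are empty.
    union = a | b
    return len(a & b) / len(union) if union else 0.0

# Character-level comparison, as in _simplify_word_list:
print(jaccard_index(set("alpha"), set("alphas")))   # 0.8  -> merged at threshold 0.80
print(jaccard_index(set("alpha"), set("kinase")))   # ~0.11 -> kept as separate tokens
```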