risk_network-0.0.8b18-py3-none-any.whl → risk_network-0.0.9b26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. risk/__init__.py +2 -2
  2. risk/annotations/__init__.py +2 -2
  3. risk/annotations/annotations.py +133 -72
  4. risk/annotations/io.py +50 -34
  5. risk/log/__init__.py +4 -2
  6. risk/log/{config.py → console.py} +5 -3
  7. risk/log/{params.py → parameters.py} +21 -46
  8. risk/neighborhoods/__init__.py +3 -5
  9. risk/neighborhoods/api.py +446 -0
  10. risk/neighborhoods/community.py +281 -96
  11. risk/neighborhoods/domains.py +92 -38
  12. risk/neighborhoods/neighborhoods.py +210 -149
  13. risk/network/__init__.py +1 -3
  14. risk/network/geometry.py +69 -58
  15. risk/network/graph/__init__.py +6 -0
  16. risk/network/graph/api.py +194 -0
  17. risk/network/graph/network.py +269 -0
  18. risk/network/graph/summary.py +254 -0
  19. risk/network/io.py +58 -48
  20. risk/network/plotter/__init__.py +6 -0
  21. risk/network/plotter/api.py +54 -0
  22. risk/network/{plot → plotter}/canvas.py +80 -26
  23. risk/network/{plot → plotter}/contour.py +43 -34
  24. risk/network/{plot → plotter}/labels.py +123 -113
  25. risk/network/plotter/network.py +424 -0
  26. risk/network/plotter/utils/colors.py +416 -0
  27. risk/network/plotter/utils/layout.py +94 -0
  28. risk/risk.py +11 -469
  29. risk/stats/__init__.py +8 -4
  30. risk/stats/binom.py +51 -0
  31. risk/stats/chi2.py +69 -0
  32. risk/stats/hypergeom.py +28 -18
  33. risk/stats/permutation/__init__.py +1 -1
  34. risk/stats/permutation/permutation.py +45 -39
  35. risk/stats/permutation/test_functions.py +25 -17
  36. risk/stats/poisson.py +17 -11
  37. risk/stats/stats.py +20 -16
  38. risk/stats/zscore.py +68 -0
  39. {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
  40. risk_network-0.0.9b26.dist-info/RECORD +44 -0
  41. {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
  42. risk/network/graph.py +0 -159
  43. risk/network/plot/__init__.py +0 -6
  44. risk/network/plot/network.py +0 -282
  45. risk/network/plot/plotter.py +0 -137
  46. risk/network/plot/utils/color.py +0 -353
  47. risk/network/plot/utils/layout.py +0 -53
  48. risk_network-0.0.8b18.dist-info/RECORD +0 -37
  49. {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
  50. {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
--- a/risk/neighborhoods/domains.py
+++ b/risk/neighborhoods/domains.py
@@ -5,32 +5,32 @@
 
 from contextlib import suppress
 from itertools import product
-from tqdm import tqdm
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
+from tqdm import tqdm
 
-from risk.annotations import get_description
+from risk.annotations import get_weighted_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger
 
 
 def define_domains(
     top_annotations: pd.DataFrame,
-    significant_neighborhoods_enrichment: np.ndarray,
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their enrichment scores and clustering,
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.
 
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        significant_neighborhoods_enrichment (np.ndarray): The binary enrichment matrix below alpha.
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups.
         linkage_method (str): The linkage method for clustering.
         linkage_metric (str): The linkage metric for clustering.
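For orientation, the docstring's "binary significance matrix below alpha" suggests a thresholded p-value matrix. A minimal sketch of how such an input could be built (the p-values and alpha here are invented for illustration, not taken from the package):

    import numpy as np

    # Hypothetical p-value matrix: rows = network nodes, columns = annotations
    pvals = np.array([[0.001, 0.20], [0.04, 0.03], [0.75, 0.01]])
    alpha = 0.05
    # 1 where the test is significant at alpha, 0 elsewhere
    significant_neighborhoods_significance = (pvals < alpha).astype(int)
    # array([[1, 0],
    #        [1, 1],
    #        [0, 1]])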
@@ -39,8 +39,14 @@ def define_domains(
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
     try:
+        if linkage_criterion == "off":
+            raise ValueError("Clustering is turned off.")
+
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_enrichment[:, top_annotations["top attributes"]].T
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+        m = _safeguard_matrix(m)
+        # Optimize silhouette score across different linkage methods and distance metrics
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
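The new `linkage_criterion == "off"` guard deliberately raises into the function's own `except ValueError` handler (shown in the next hunk), so "clustering disabled" and "clustering failed" share one fallback path. The pattern in isolation, as a minimal self-contained sketch:

    def cluster_or_fallback(items, criterion):
        # Minimal illustration of the raise-into-own-except pattern; the stub
        # stands in for the real linkage/fcluster pipeline.
        def do_clustering(xs):
            return [1] * len(xs)  # pretend everything lands in one cluster

        try:
            if criterion == "off":
                raise ValueError("Clustering is turned off.")
            labels = do_clustering(items)  # the real pipeline may also raise ValueError
        except ValueError:
            labels = list(range(1, len(items) + 1))  # fallback: one unique label each
        return labels

    print(cluster_or_fallback(["a", "b"], "off"))       # [1, 2]
    print(cluster_or_fallback(["a", "b"], "distance"))  # [1, 1]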
@@ -55,40 +61,49 @@ def define_domains(
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["top attributes"], "domain"] = domains
+        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
     except ValueError:
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
-        logger.error(
-            f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-        )
+        if linkage_criterion == "off":
+            logger.warning(
+                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+            )
+        else:
+            logger.error(
+                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+            )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
-    node_to_enrichment = pd.DataFrame(
-        data=significant_neighborhoods_enrichment,
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
         columns=[top_annotations.index.values, top_annotations["domain"]],
     )
-    node_to_domain = node_to_enrichment.groupby(level="domain", axis=1).sum()
+    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
 
-    # Find the maximum enrichment score for each node
+    # Find the maximum significance score for each node
     t_max = node_to_domain.loc[:, 1:].max(axis=1)
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
 
+    # Assign all domains where the score is greater than 0
+    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
+        lambda row: list(row[row > 0].index), axis=1
+    )
     # Assign primary domain
-    node_to_domain["primary domain"] = t_idxmax
+    node_to_domain["primary_domain"] = t_idxmax
 
     return node_to_domain
 
 
-def trim_domains_and_top_annotations(
+def trim_domains(
     domains: pd.DataFrame,
     top_annotations: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """Trim domains and top annotations that do not meet size criteria and find outliers.
+    """Trim domains that do not meet size criteria and find outliers.
 
     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
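One behavioral detail in this hunk: `groupby(level="domain", axis=1)` is replaced by a transpose-groupby-transpose, which produces the same column-wise sums without `axis=1` (column-axis groupby is deprecated in recent pandas). A small sketch of the equivalence, with made-up data:

    import pandas as pd

    # Columns carry a MultiIndex whose second level is named "domain"
    cols = pd.MultiIndex.from_arrays(
        [["t1", "t2", "t3"], [1, 1, 2]], names=[None, "domain"]
    )
    df = pd.DataFrame([[1, 0, 1], [0, 1, 1]], columns=cols)

    # Old (deprecated): df.groupby(level="domain", axis=1).sum()
    summed = df.T.groupby(level="domain").sum().T
    print(summed)
    # domain  1  2
    # 0       1  1
    # 1       1  1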
@@ -97,13 +112,12 @@ def trim_domains_and_top_annotations(
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
     Returns:
-        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing:
-        - Trimmed annotations (pd.DataFrame)
+        Tuple[pd.DataFrame, pd.DataFrame]:
         - Trimmed domains (pd.DataFrame)
         - A DataFrame with domain labels (pd.DataFrame)
     """
     # Identify domains to remove based on size criteria
-    domain_counts = domains["primary domain"].value_counts()
+    domain_counts = domains["primary_domain"].value_counts()
     to_remove = set(
         domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
     )
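The size filter above is plain value-counting over the per-node primary domain; with toy numbers (not from the package):

    import pandas as pd

    primary_domain = pd.Series([1, 1, 1, 2, 3, 3])
    domain_counts = primary_domain.value_counts()  # {1: 3, 3: 2, 2: 1}
    min_cluster_size, max_cluster_size = 2, 5
    to_remove = set(
        domain_counts[
            (domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)
        ].index
    )
    print(to_remove)  # {2}: domain 2 is smaller than min_cluster_size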
@@ -112,34 +126,73 @@
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
-    domains.loc[domains["primary domain"].isin(to_remove), ["primary domain"]] = invalid_domain_id
+    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
+    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
-    # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "neighborhood enrichment sums"
+        "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Multiply 'words' column by normalized values
-    top_annotations["words"] = top_annotations.apply(
-        lambda row: " ".join([str(row["words"])] * row["normalized_value"]), axis=1
+    # Modify the lambda function to pass both full_terms and significant_significance_score
+    top_annotations["combined_terms"] = top_annotations.apply(
+        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
-    # Generate domain labels
-    domain_labels = top_annotations.groupby("domain")["words"].apply(get_description).reset_index()
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
+    domain_labels = (
+        top_annotations.groupby("domain")
+        .agg(
+            full_terms=("full_terms", lambda x: list(x)),
+            significance_scores=("significant_significance_score", lambda x: list(x)),
+        )
+        .reset_index()
+    )
+    domain_labels["combined_terms"] = domain_labels.apply(
+        lambda row: get_weighted_description(
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
+        ),
+        axis=1,
+    )
+
+    # Rename the columns as necessary
     trimmed_domains_matrix = domain_labels.rename(
-        columns={"domain": "id", "words": "label"}
+        columns={
+            "domain": "id",
+            "combined_terms": "normalized_description",
+            "full_terms": "full_descriptions",
+            "significance_scores": "significance_scores",
+        }
     ).set_index("id")
 
     # Remove invalid domains
-    valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-        columns=["normalized_value"]
-    )
-    valid_domains = domains[~domains["primary domain"].isin(invalid_domain_ids)]
+    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
+    return valid_domains, valid_trimmed_domains_matrix
+
+
+def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
 
-    return valid_annotations, valid_domains, valid_trimmed_domains_matrix
+    Args:
+        matrix (np.ndarray): Data matrix.
+
+    Returns:
+        np.ndarray: Safeguarded data matrix.
+    """
+    # Replace NaN with column mean
+    nan_replacement = np.nanmean(matrix, axis=0)
+    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+    # Replace Inf/-Inf with maximum/minimum finite values
+    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+    # Ensure rows have non-zero variance (optional step)
+    row_variance = np.var(matrix, axis=1)
+    matrix = matrix[row_variance > 0]
+    return matrix
 
 
 def _optimize_silhouette_across_linkage_and_metrics(
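To see what the new `_safeguard_matrix` does end to end, a small worked example (values invented): NaNs become column means, infinities are clamped to the finite extremes, and zero-variance rows are dropped, so the output can have fewer rows than the input.

    import numpy as np

    m = np.array([
        [1.0, np.nan, 2.0],
        [np.inf, 0.5, -np.inf],
        [3.0, 3.0, 3.0],  # zero variance -- removed by the final step
    ])
    safe = _safeguard_matrix(m)
    # NaN -> nanmean of its column (1.75); Inf -> 3.0 (finite max);
    # -Inf -> 0.5 (finite min); the constant row is dropped,
    # leaving a 2x3 matrix.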
@@ -154,7 +207,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
         linkage_metric (str): Linkage metric for clustering.
 
     Returns:
-        tuple[str, str, float]: A tuple containing:
+        Tuple[str, str, float]:
         - Best linkage method (str)
         - Best linkage metric (str)
         - Best threshold (float)
@@ -175,7 +228,8 @@ def _optimize_silhouette_across_linkage_and_metrics(
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-        with suppress(Exception):
+        # Some linkage methods and metrics may not work with certain data
+        with suppress(ValueError):
             Z = linkage(m, method=method, metric=metric)
             threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
             if score > best_overall_score:
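Narrowing `suppress(Exception)` to `suppress(ValueError)` keeps the intended tolerance (SciPy's `linkage` raises `ValueError` for method/metric combinations it does not support) while letting unexpected errors such as `TypeError` or `MemoryError` propagate. The mechanics of `contextlib.suppress`, in brief:

    from contextlib import suppress

    with suppress(ValueError):
        raise ValueError("unsupported method/metric combination")  # skipped
    print("still running")  # reached: the ValueError was suppressed
    # A TypeError raised inside the same block would NOT be suppressed.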
@@ -208,7 +262,7 @@ def _find_best_silhouette_score(
         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
 
     Returns:
-        tuple[float, float]: A tuple containing:
+        Tuple[float, float]:
         - Best threshold (float): The threshold that yields the best silhouette score.
         - Best silhouette score (float): The highest silhouette score achieved.
     """