risk-network 0.0.8b27__py3-none-any.whl → 0.0.9b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +39 -38
- risk/annotations/io.py +8 -6
- risk/log/__init__.py +3 -1
- risk/log/{params.py → parameters.py} +9 -34
- risk/neighborhoods/domains.py +18 -18
- risk/neighborhoods/neighborhoods.py +104 -92
- risk/network/graph/__init__.py +6 -0
- risk/network/{graph.py → graph/network.py} +38 -27
- risk/network/graph/summary.py +239 -0
- risk/network/io.py +3 -3
- risk/network/plot/contour.py +1 -1
- risk/network/plot/labels.py +1 -1
- risk/network/plot/network.py +28 -28
- risk/network/plot/utils/color.py +27 -27
- risk/risk.py +25 -30
- risk/stats/stats.py +13 -13
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b2.dist-info}/METADATA +1 -1
- risk_network-0.0.9b2.dist-info/RECORD +39 -0
- risk_network-0.0.8b27.dist-info/RECORD +0 -37
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b2.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b2.dist-info}/WHEEL +0 -0
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b2.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/annotations/annotations.py
CHANGED
@@ -83,69 +83,69 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
 def define_top_annotations(
     network: nx.Graph,
     ordered_annotation_labels: List[str],
-
-
-
+    neighborhood_significance_sums: List[int],
+    significant_significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> pd.DataFrame:
-    """Define top annotations based on neighborhood
+    """Define top annotations based on neighborhood significance sums and binary significance matrix.
 
     Args:
         network (NetworkX graph): The network graph.
         ordered_annotation_labels (list of str): List of ordered annotation labels.
-
-
-
+        neighborhood_significance_sums (list of int): List of neighborhood significance sums.
+        significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+        significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
 
     Returns:
         pd.DataFrame: DataFrame with top annotations and their properties.
     """
-    # Sum the columns of the significant
-
-    # Create DataFrame to store annotations, their neighborhood
-
+    # Sum the columns of the significant significance matrix (positive floating point values)
+    significant_significance_scores = significant_significance_matrix.sum(axis=0)
+    # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
+    annotations_significance_matrix = pd.DataFrame(
         {
             "id": range(len(ordered_annotation_labels)),
             "full_terms": ordered_annotation_labels,
-
-
+            "significant_neighborhood_significance_sums": neighborhood_significance_sums,
+            "significant_significance_score": significant_significance_scores,
         }
     )
-
+    annotations_significance_matrix["significant_annotations"] = False
     # Apply size constraints to identify potential significant annotations
-
+    annotations_significance_matrix.loc[
         (
-
+            annotations_significance_matrix["significant_neighborhood_significance_sums"]
             >= min_cluster_size
         )
         & (
-
+            annotations_significance_matrix["significant_neighborhood_significance_sums"]
             <= max_cluster_size
         ),
         "significant_annotations",
     ] = True
     # Initialize columns for connected components analysis
-
-
-
+    annotations_significance_matrix["num_connected_components"] = 0
+    annotations_significance_matrix["size_connected_components"] = None
+    annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
        "size_connected_components"
     ].astype(object)
-
+    annotations_significance_matrix["num_large_connected_components"] = 0
 
-    for attribute in
-
+    for attribute in annotations_significance_matrix.index.values[
+        annotations_significance_matrix["significant_annotations"]
     ]:
-        # Identify
-
-            compress(list(network),
+        # Identify significant neighborhoods based on the binary significance matrix
+        significant_neighborhoods = list(
+            compress(list(network), significant_binary_significance_matrix[:, attribute])
        )
-
-        # Analyze connected components within the
+        significant_network = nx.subgraph(network, significant_neighborhoods)
+        # Analyze connected components within the significant subnetwork
        connected_components = sorted(
-            nx.connected_components(
+            nx.connected_components(significant_network), key=len, reverse=True
        )
        size_connected_components = np.array([len(c) for c in connected_components])
 
@@ -159,23 +159,24 @@ def define_top_annotations(
        num_large_connected_components = len(filtered_size_connected_components)
 
        # Assign the number of connected components
-
+        annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
            num_connected_components
        )
        # Filter out attributes with more than one connected component
-
-
+        annotations_significance_matrix.loc[
+            annotations_significance_matrix["num_connected_components"] > 1,
+            "significant_annotations",
        ] = False
        # Assign the number of large connected components
-
+        annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
            num_large_connected_components
        )
        # Assign the size of connected components, ensuring it is always a list
-
+        annotations_significance_matrix.at[attribute, "size_connected_components"] = (
            filtered_size_connected_components.tolist()
        )
 
-    return
+    return annotations_significance_matrix
 
 
 def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
@@ -184,16 +185,16 @@ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series)
 
     Args:
        words_column (pd.Series): A pandas Series containing strings to process.
-        scores_column (pd.Series): A pandas Series containing
+        scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.
 
     Returns:
-        str: A coherent description formed from the most frequent and significant words, weighed by
+        str: A coherent description formed from the most frequent and significant words, weighed by significance scores.
    """
    # Handle case where all scores are the same
    if scores_column.max() == scores_column.min():
        normalized_scores = pd.Series([1] * len(scores_column))
    else:
-        # Normalize the
+        # Normalize the significance scores to be between 0 and 1
        normalized_scores = (scores_column - scores_column.min()) / (
            scores_column.max() - scores_column.min()
        )
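The renamed `define_top_annotations` signature above can be exercised directly. A minimal sketch, assuming the function is importable from `risk.annotations.annotations` and fed toy node-by-annotation matrices (real inputs come from the neighborhood significance step of the pipeline):

import networkx as nx
import numpy as np

# Hypothetical usage sketch; the import path and toy inputs are assumptions,
# not taken from the package documentation.
from risk.annotations.annotations import define_top_annotations

network = nx.karate_club_graph()
rng = np.random.default_rng(0)

labels = ["term_a", "term_b"]
# Node-by-annotation matrices; in the real pipeline these come from the
# neighborhood significance step.
significance = rng.random((network.number_of_nodes(), len(labels)))
binary = (significance > 0.5).astype(int)

top_annotations = define_top_annotations(
    network=network,
    ordered_annotation_labels=labels,
    neighborhood_significance_sums=binary.sum(axis=0).tolist(),
    significant_significance_matrix=significance,
    significant_binary_significance_matrix=binary,
    min_cluster_size=5,
    max_cluster_size=1000,
)
print(top_annotations[["full_terms", "significant_annotations"]])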
risk/annotations/io.py
CHANGED
@@ -76,11 +76,13 @@ class AnnotationsIO:
        _log_loading(filetype, filepath=filepath)
 
        # Load the specified sheet from the Excel file
-
+        annotation = pd.read_excel(filepath, sheet_name=sheet_name)
        # Split the nodes column by the specified nodes_delimiter
-
+        annotation[nodes_colname] = annotation[nodes_colname].apply(
+            lambda x: x.split(nodes_delimiter)
+        )
        # Convert the DataFrame to a dictionary pairing labels with their corresponding nodes
-        label_node_dict =
+        label_node_dict = annotation.set_index(label_colname)[nodes_colname].to_dict()
 
        # Load the annotations into the provided network
        return load_annotations(network, label_node_dict)
@@ -203,11 +205,11 @@ def _load_matrix_file(
        Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes.
    """
    # Load the CSV or TSV file into a DataFrame
-
+    annotation = pd.read_csv(filepath, delimiter=delimiter)
    # Split the nodes column by the nodes_delimiter to handle multiple nodes per label
-
+    annotation[nodes_colname] = annotation[nodes_colname].apply(lambda x: x.split(nodes_delimiter))
    # Create a dictionary pairing labels with their corresponding list of nodes
-    label_node_dict =
+    label_node_dict = annotation.set_index(label_colname)[nodes_colname].to_dict()
    return label_node_dict
 
 
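Both loaders above reduce to the same pandas pattern: split the delimited node column into lists, then index by the label column and convert to a dict. A self-contained sketch with hypothetical column names and delimiter:

import pandas as pd

# Toy annotation table; the column names "label"/"nodes" and the ";" delimiter
# are hypothetical stand-ins.
annotation = pd.DataFrame(
    {"label": ["GO:0001", "GO:0002"], "nodes": ["geneA;geneB", "geneC"]}
)
# Split each delimited node string into a list of node names
annotation["nodes"] = annotation["nodes"].apply(lambda x: x.split(";"))
# Pair each label with its list of nodes
label_node_dict = annotation.set_index("label")["nodes"].to_dict()
# {'GO:0001': ['geneA', 'geneB'], 'GO:0002': ['geneC']}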
risk/log/__init__.py
CHANGED
risk/log/{params.py → parameters.py}
CHANGED
@@ -1,50 +1,22 @@
 """
-risk/log/
-
+risk/log/parameters
+~~~~~~~~~~~~~~~~~~~
 """
 
 import csv
 import json
 import warnings
 from datetime import datetime
-from functools import wraps
 from typing import Any, Dict
 
 import numpy as np
 
-from .console import logger, log_header
+from risk.log.console import logger, log_header
 
 # Suppress all warnings - this is to resolve warnings from multiprocessing
 warnings.filterwarnings("ignore")
 
 
-def _safe_param_export(func):
-    """A decorator to wrap parameter export functions in a try-except block for safe execution.
-
-    Args:
-        func (function): The function to be wrapped.
-
-    Returns:
-        function: The wrapped function with error handling.
-    """
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            result = func(*args, **kwargs)
-            filepath = (
-                kwargs.get("filepath") or args[1]
-            )  # Assuming filepath is always the second argument
-            logger.info(f"Parameters successfully exported to filepath: {filepath}")
-            return result
-        except Exception as e:
-            filepath = kwargs.get("filepath") or args[1]
-            logger.error(f"An error occurred while exporting parameters to {filepath}: {e}")
-            return None
-
-    return wrapper
-
-
 class Params:
    """Handles the storage and logging of various parameters for network analysis.
 
@@ -106,7 +78,6 @@ class Params:
        """
        self.plotter = {**self.plotter, **kwargs}
 
-    @_safe_param_export
    def to_csv(self, filepath: str) -> None:
        """Export the parameters to a CSV file.
 
@@ -128,7 +99,8 @@ class Params:
                else:
                    writer.writerow([parent_key, "", parent_value])
 
-
+        logger.info(f"Parameters exported to CSV file: {filepath}")
+
    def to_json(self, filepath: str) -> None:
        """Export the parameters to a JSON file.
 
@@ -138,7 +110,8 @@ class Params:
        with open(filepath, "w") as json_file:
            json.dump(self.load(), json_file, indent=4)
 
-
+        logger.info(f"Parameters exported to JSON file: {filepath}")
+
    def to_txt(self, filepath: str) -> None:
        """Export the parameters to a text file.
 
@@ -155,6 +128,8 @@ class Params:
            # Add a blank line after each entry
            txt_file.write("\n")
 
+        logger.info(f"Parameters exported to text file: {filepath}")
+
    def load(self) -> Dict[str, Any]:
        """Load and process various parameters, converting any np.ndarray values to lists.
 
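The export methods now log the destination inline after a successful write instead of going through the removed `_safe_param_export` decorator. A minimal stand-in class (not the package's actual `Params`) showing the same pattern:

import json
import logging

logger = logging.getLogger("risk")
logging.basicConfig(level=logging.INFO)


class MiniParams:
    """Toy stand-in for risk.log.Params; only the export-then-log pattern is real."""

    def load(self) -> dict:
        # Placeholder values; the real class gathers annotation, network,
        # neighborhood, and plotter settings.
        return {"example_setting": 1, "datetime": "2024-01-01"}

    def to_json(self, filepath: str) -> None:
        with open(filepath, "w") as json_file:
            json.dump(self.load(), json_file, indent=4)
        logger.info(f"Parameters exported to JSON file: {filepath}")


MiniParams().to_json("parameters.json")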
risk/neighborhoods/domains.py
CHANGED
@@ -20,17 +20,17 @@ from risk.log import logger
 
 def define_domains(
     top_annotations: pd.DataFrame,
-
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.
 
     Args:
        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
        linkage_criterion (str): The clustering criterion for defining groups.
        linkage_method (str): The linkage method for clustering.
        linkage_metric (str): The linkage metric for clustering.
@@ -40,7 +40,7 @@ def define_domains(
    """
    try:
        # Transpose the matrix to cluster annotations
-        m =
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
        best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
            m, linkage_criterion, linkage_method, linkage_metric
        )
@@ -65,13 +65,13 @@ def define_domains(
        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
    # Create DataFrames to store domain information
-
-        data=
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
        columns=[top_annotations.index.values, top_annotations["domain"]],
    )
-    node_to_domain =
+    node_to_domain = node_to_significance.groupby(level="domain", axis=1).sum()
 
-    # Find the maximum
+    # Find the maximum significance score for each node
    t_max = node_to_domain.loc[:, 1:].max(axis=1)
    t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
    t_idxmax[t_max == 0] = 0
@@ -101,7 +101,7 @@ def trim_domains_and_top_annotations(
        max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
            - Trimmed annotations (pd.DataFrame)
            - Trimmed domains (pd.DataFrame)
            - A DataFrame with domain labels (pd.DataFrame)
@@ -119,27 +119,27 @@ def trim_domains_and_top_annotations(
    top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
-    # Normalize "num
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
    top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-
+        "significant_neighborhood_significance_sums"
    ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Modify the lambda function to pass both full_terms and
+    # Modify the lambda function to pass both full_terms and significant_significance_score
    top_annotations["combined_terms"] = top_annotations.apply(
        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
    )
 
-    # Perform the groupby operation while retaining the other columns and adding the weighting with
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
    domain_labels = (
        top_annotations.groupby("domain")
        .agg(
            full_terms=("full_terms", lambda x: list(x)),
-
+            significance_scores=("significant_significance_score", lambda x: list(x)),
        )
        .reset_index()
    )
    domain_labels["combined_terms"] = domain_labels.apply(
        lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
        ),
        axis=1,
    )
@@ -150,7 +150,7 @@ def trim_domains_and_top_annotations(
            "domain": "id",
            "combined_terms": "normalized_description",
            "full_terms": "full_descriptions",
-
+            "significance_scores": "significance_scores",
        }
    ).set_index("id")
 
@@ -177,7 +177,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
        linkage_metric (str): Linkage metric for clustering.
 
    Returns:
-        Tuple[str, str, float]:
+        Tuple[str, str, float]:
            - Best linkage method (str)
            - Best linkage metric (str)
            - Best threshold (float)
@@ -231,7 +231,7 @@ def _find_best_silhouette_score(
        resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
 
    Returns:
-        Tuple[float, float]:
+        Tuple[float, float]:
            - Best threshold (float): The threshold that yields the best silhouette score.
            - Best silhouette score (float): The highest silhouette score achieved.
    """
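The node-to-domain aggregation in `define_domains` groups annotation columns by their assigned domain and sums the significance per node. A small sketch of that step with toy data; the explicit MultiIndex and the transpose-based groupby are choices made here for portability, since `DataFrame.groupby(axis=1)` is deprecated in recent pandas:

import numpy as np
import pandas as pd

# Toy node-by-annotation significance matrix: 4 nodes, 3 annotations.
significance = np.array(
    [
        [1.0, 0.0, 2.0],
        [0.0, 3.0, 0.0],
        [1.5, 0.0, 0.0],
        [0.0, 0.5, 1.0],
    ]
)
# Each annotation is assigned to a domain (two annotations share domain 1).
columns = pd.MultiIndex.from_arrays(
    [[101, 102, 103], [1, 2, 1]], names=["annotation", "domain"]
)
node_to_significance = pd.DataFrame(significance, columns=columns)

# Sum annotation columns that share a domain, per node.
node_to_domain = node_to_significance.T.groupby(level="domain").sum().T

# Pick the best-scoring domain for each node; nodes with all-zero scores get 0.
t_max = node_to_domain.max(axis=1)
t_idxmax = node_to_domain.idxmax(axis=1)
t_idxmax[t_max == 0] = 0
print(node_to_domain)
print(t_idxmax)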