risk-network 0.0.6b9__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/risk.py CHANGED
@@ -10,7 +10,7 @@ import numpy as np
 import pandas as pd
 
 from risk.annotations import AnnotationsIO, define_top_annotations
-from risk.log import params, print_header
+from risk.log import params, logger, log_header, set_global_verbosity
 from risk.neighborhoods import (
     define_domains,
     get_network_neighborhoods,
@@ -20,9 +20,9 @@ from risk.neighborhoods import (
 from risk.network import NetworkIO, NetworkGraph, NetworkPlotter
 from risk.stats import (
     calculate_significance_matrices,
-    compute_fisher_exact_test,
     compute_hypergeom_test,
     compute_permutation_test,
+    compute_poisson_test,
 )
 
 
@@ -33,60 +33,60 @@ class RISK(NetworkIO, AnnotationsIO):
     and performing network-based statistical analysis, such as neighborhood significance testing.
     """
 
-    def __init__(self, *args, **kwargs):
-        """Initialize the RISK class with configuration settings."""
+    def __init__(self, verbose: bool = True):
+        """Initialize the RISK class with configuration settings.
+
+        Args:
+            verbose (bool): If False, suppresses all log messages to the console. Defaults to True.
+        """
+        # Set global verbosity for logging
+        set_global_verbosity(verbose)
         # Initialize and log network parameters
         params.initialize()
-        # Initialize the parent classes
-        super().__init__(*args, **kwargs)
+        super().__init__()
 
     @property
-    def params(self):
-        """Access the logged parameters."""
+    def params(self) -> params:
+        """Access the logged network parameters.
+
+        Returns:
+            Params: An instance of the Params class with logged parameters and methods to access or update them.
+        """
         return params
 
-    def load_neighborhoods_by_permutation(
+    def load_neighborhoods_by_hypergeom(
         self,
         network: nx.Graph,
         annotations: Dict[str, Any],
-        distance_metric: str = "dijkstra",
+        distance_metric: str = "louvain",
         louvain_resolution: float = 0.1,
         edge_length_threshold: float = 0.5,
-        score_metric: str = "sum",
         null_distribution: str = "network",
-        num_permutations: int = 1000,
         random_seed: int = 888,
-        max_workers: int = 1,
     ) -> Dict[str, Any]:
-        """Load significant neighborhoods for the network using the permutation test.
+        """Load significant neighborhoods for the network using the hypergeometric test.
 
         Args:
             network (nx.Graph): The network graph.
-            annotations (pd.DataFrame): The matrix of annotations associated with the network.
-            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+            annotations (dict): The annotations associated with the network.
+            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
             louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
             edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
-            score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
-            null_distribution (str, optional): Distribution used for permutation tests. Defaults to "network".
-            num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
+            null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
             random_seed (int, optional): Seed for random number generation. Defaults to 888.
-            max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.
 
         Returns:
             dict: Computed significance of neighborhoods.
         """
-        print_header("Running permutation test")
+        log_header("Running hypergeometric test")
         # Log neighborhood analysis parameters
         params.log_neighborhoods(
             distance_metric=distance_metric,
             louvain_resolution=louvain_resolution,
             edge_length_threshold=edge_length_threshold,
-            statistical_test_function="permutation",
-            score_metric=score_metric,
+            statistical_test_function="hypergeom",
             null_distribution=null_distribution,
-            num_permutations=num_permutations,
             random_seed=random_seed,
-            max_workers=max_workers,
         )
 
         # Load neighborhoods based on the network and distance metric
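
Taken together, these hunks define the new 0.0.7 entry point: an explicit `verbose` flag instead of `*args`/`**kwargs`, and "louvain" as the default distance metric. A minimal call sketch, assuming `network` is an `nx.Graph` and `annotations` is the dict (with a "matrix" key) produced by the package's own loaders, which this diff does not show:

from risk import RISK  # assumed import path; only risk/risk.py appears in this diff

risk = RISK(verbose=False)  # new in 0.0.7; 0.0.6b9 took *args/**kwargs
neighborhood_significance = risk.load_neighborhoods_by_hypergeom(
    network=network,              # nx.Graph placeholder
    annotations=annotations,      # dict with a "matrix" key, per the docstring
    null_distribution="network",  # or "annotations"
    random_seed=888,
)
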
@@ -97,59 +97,49 @@ class RISK(NetworkIO, AnnotationsIO):
             edge_length_threshold=edge_length_threshold,
             random_seed=random_seed,
         )
-
-        # Log and display permutation test settings
-        print(f"Neighborhood scoring metric: '{score_metric}'")
-        print(f"Null distribution: '{null_distribution}'")
-        print(f"Number of permutations: {num_permutations}")
-        print(f"Maximum workers: {max_workers}")
-        # Run permutation test to compute neighborhood significance
-        neighborhood_significance = compute_permutation_test(
+        # Run hypergeometric test to compute neighborhood significance
+        neighborhood_significance = compute_hypergeom_test(
             neighborhoods=neighborhoods,
             annotations=annotations["matrix"],
-            score_metric=score_metric,
             null_distribution=null_distribution,
-            num_permutations=num_permutations,
-            random_seed=random_seed,
-            max_workers=max_workers,
         )
 
         # Return the computed neighborhood significance
         return neighborhood_significance
 
-    def load_neighborhoods_by_fisher_exact(
+    def load_neighborhoods_by_poisson(
         self,
         network: nx.Graph,
         annotations: Dict[str, Any],
-        distance_metric: str = "dijkstra",
+        distance_metric: str = "louvain",
         louvain_resolution: float = 0.1,
         edge_length_threshold: float = 0.5,
+        null_distribution: str = "network",
         random_seed: int = 888,
-        max_workers: int = 1,
     ) -> Dict[str, Any]:
-        """Load significant neighborhoods for the network using the Fisher's exact test.
+        """Load significant neighborhoods for the network using the Poisson test.
 
         Args:
             network (nx.Graph): The network graph.
-            annotations (pd.DataFrame): The matrix of annotations associated with the network.
-            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+            annotations (dict): The annotations associated with the network.
+            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
             louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
             edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+            null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
             random_seed (int, optional): Seed for random number generation. Defaults to 888.
-            max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.
 
         Returns:
             dict: Computed significance of neighborhoods.
         """
-        print_header("Running Fisher's exact test")
+        log_header("Running Poisson test")
         # Log neighborhood analysis parameters
         params.log_neighborhoods(
             distance_metric=distance_metric,
             louvain_resolution=louvain_resolution,
             edge_length_threshold=edge_length_threshold,
-            statistical_test_function="fisher_exact",
+            statistical_test_function="poisson",
+            null_distribution=null_distribution,
             random_seed=random_seed,
-            max_workers=max_workers,
         )
 
         # Load neighborhoods based on the network and distance metric
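
`load_neighborhoods_by_fisher_exact` is removed outright; `load_neighborhoods_by_poisson` takes its slot with the same shape of signature, trading `max_workers` for `null_distribution`. A hedged sketch reusing the placeholder objects from the example above:

neighborhood_significance = risk.load_neighborhoods_by_poisson(
    network=network,
    annotations=annotations,
    null_distribution="annotations",  # restrict the background to annotated nodes
    random_seed=888,
)
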
@@ -160,50 +150,56 @@ class RISK(NetworkIO, AnnotationsIO):
             edge_length_threshold=edge_length_threshold,
             random_seed=random_seed,
         )
-
-        # Log and display Fisher's exact test settings
-        print(f"Maximum workers: {max_workers}")
-        # Run Fisher's exact test to compute neighborhood significance
-        neighborhood_significance = compute_fisher_exact_test(
+        # Run Poisson test to compute neighborhood significance
+        neighborhood_significance = compute_poisson_test(
             neighborhoods=neighborhoods,
             annotations=annotations["matrix"],
-            max_workers=max_workers,
+            null_distribution=null_distribution,
         )
 
         # Return the computed neighborhood significance
         return neighborhood_significance
 
-    def load_neighborhoods_by_hypergeom(
+    def load_neighborhoods_by_permutation(
         self,
         network: nx.Graph,
         annotations: Dict[str, Any],
-        distance_metric: str = "dijkstra",
+        distance_metric: str = "louvain",
         louvain_resolution: float = 0.1,
         edge_length_threshold: float = 0.5,
+        score_metric: str = "sum",
+        null_distribution: str = "network",
+        num_permutations: int = 1000,
         random_seed: int = 888,
         max_workers: int = 1,
     ) -> Dict[str, Any]:
-        """Load significant neighborhoods for the network using the hypergeometric test.
+        """Load significant neighborhoods for the network using the permutation test.
 
         Args:
             network (nx.Graph): The network graph.
-            annotations (pd.DataFrame): The matrix of annotations associated with the network.
-            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+            annotations (dict): The annotations associated with the network.
+            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
             louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
             edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+            score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
+            null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+            num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
             random_seed (int, optional): Seed for random number generation. Defaults to 888.
             max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.
 
         Returns:
             dict: Computed significance of neighborhoods.
         """
-        print_header("Running hypergeometric test")
+        log_header("Running permutation test")
         # Log neighborhood analysis parameters
         params.log_neighborhoods(
             distance_metric=distance_metric,
             louvain_resolution=louvain_resolution,
             edge_length_threshold=edge_length_threshold,
-            statistical_test_function="hypergeom",
+            statistical_test_function="permutation",
+            score_metric=score_metric,
+            null_distribution=null_distribution,
+            num_permutations=num_permutations,
             random_seed=random_seed,
             max_workers=max_workers,
         )
@@ -217,12 +213,19 @@ class RISK(NetworkIO, AnnotationsIO):
             random_seed=random_seed,
         )
 
-        # Log and display hypergeometric test settings
-        print(f"Maximum workers: {max_workers}")
-        # Run hypergeometric test to compute neighborhood significance
-        neighborhood_significance = compute_hypergeom_test(
+        # Log and display permutation test settings
+        logger.debug(f"Neighborhood scoring metric: '{score_metric}'")
+        logger.debug(f"Null distribution: '{null_distribution}'")
+        logger.debug(f"Number of permutations: {num_permutations}")
+        logger.debug(f"Maximum workers: {max_workers}")
+        # Run permutation test to compute neighborhood significance
+        neighborhood_significance = compute_permutation_test(
             neighborhoods=neighborhoods,
             annotations=annotations["matrix"],
+            score_metric=score_metric,
+            null_distribution=null_distribution,
+            num_permutations=num_permutations,
+            random_seed=random_seed,
             max_workers=max_workers,
         )
 
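
The permutation test keeps `score_metric`, `num_permutations`, and `max_workers`, making it the only test in 0.0.7 that still fans out across workers; its settings now go to `logger.debug` rather than `print`. A call sketch against the signature above, with the same placeholder objects:

neighborhood_significance = risk.load_neighborhoods_by_permutation(
    network=network,
    annotations=annotations,
    score_metric="sum",
    null_distribution="network",
    num_permutations=1000,
    random_seed=888,
    max_workers=4,  # parallelism survives only in the permutation test
)
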
@@ -237,7 +240,7 @@ class RISK(NetworkIO, AnnotationsIO):
         tail: str = "right",  # OPTIONS: "right" (enrichment), "left" (depletion), "both"
         pval_cutoff: float = 0.01,  # OPTIONS: Any value between 0 to 1
         fdr_cutoff: float = 0.9999,  # OPTIONS: Any value between 0 to 1
-        impute_depth: int = 1,
+        impute_depth: int = 0,
         prune_threshold: float = 0.0,
         linkage_criterion: str = "distance",
         linkage_method: str = "average",
@@ -254,7 +257,7 @@ class RISK(NetworkIO, AnnotationsIO):
             tail (str, optional): Type of significance tail ("right", "left", "both"). Defaults to "right".
             pval_cutoff (float, optional): p-value cutoff for significance. Defaults to 0.01.
             fdr_cutoff (float, optional): FDR cutoff for significance. Defaults to 0.9999.
-            impute_depth (int, optional): Depth for imputing neighbors. Defaults to 1.
+            impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
             prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
             linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
             linkage_method (str, optional): Clustering method to use. Defaults to "average".
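
Because `impute_depth` now defaults to 0, neighbor imputation is off unless a caller opts back in. A sketch of restoring the 0.0.6b9 behavior; the enclosing method's name and full parameter list sit outside these hunks, so `load_graph` and its keywords here are hypothetical stand-ins:

graph = risk.load_graph(  # hypothetical name; not visible in this diff
    network=network,
    annotations=annotations,
    neighborhoods=neighborhood_significance,
    impute_depth=1,  # pre-0.0.7 default
)
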
@@ -266,7 +269,7 @@ class RISK(NetworkIO, AnnotationsIO):
             NetworkGraph: A fully initialized and processed NetworkGraph object.
         """
         # Log the parameters and display headers
-        print_header("Finding significant neighborhoods")
+        log_header("Finding significant neighborhoods")
         params.log_graph(
             tail=tail,
             pval_cutoff=pval_cutoff,
@@ -280,9 +283,9 @@ class RISK(NetworkIO, AnnotationsIO):
             max_cluster_size=max_cluster_size,
         )
 
-        print(f"p-value cutoff: {pval_cutoff}")
-        print(f"FDR BH cutoff: {fdr_cutoff}")
-        print(
+        logger.debug(f"p-value cutoff: {pval_cutoff}")
+        logger.debug(f"FDR BH cutoff: {fdr_cutoff}")
+        logger.debug(
             f"Significance tail: '{tail}' ({'enrichment' if tail == 'right' else 'depletion' if tail == 'left' else 'both'})"
         )
         # Calculate significant neighborhoods based on the provided parameters
@@ -294,7 +297,7 @@ class RISK(NetworkIO, AnnotationsIO):
             fdr_cutoff=fdr_cutoff,
         )
 
-        print_header("Processing neighborhoods")
+        log_header("Processing neighborhoods")
         # Process neighborhoods by imputing and pruning based on the given settings
         processed_neighborhoods = process_neighborhoods(
             network=network,
@@ -303,9 +306,9 @@ class RISK(NetworkIO, AnnotationsIO):
             prune_threshold=prune_threshold,
         )
 
-        print_header("Finding top annotations")
-        print(f"Min cluster size: {min_cluster_size}")
-        print(f"Max cluster size: {max_cluster_size}")
+        log_header("Finding top annotations")
+        logger.debug(f"Min cluster size: {min_cluster_size}")
+        logger.debug(f"Max cluster size: {max_cluster_size}")
         # Define top annotations based on processed neighborhoods
         top_annotations = self._define_top_annotations(
             network=network,
@@ -315,7 +318,7 @@ class RISK(NetworkIO, AnnotationsIO):
             max_cluster_size=max_cluster_size,
         )
 
-        print_header(f"Optimizing distance threshold for domains")
+        log_header("Optimizing distance threshold for domains")
         # Define domains in the network using the specified clustering settings
         domains = self._define_domains(
             neighborhoods=processed_neighborhoods,
@@ -363,7 +366,7 @@ class RISK(NetworkIO, AnnotationsIO):
         Returns:
             NetworkPlotter: A NetworkPlotter object configured with the given parameters.
         """
-        print_header("Loading plotter")
+        log_header("Loading plotter")
         # Log the plotter settings
         params.log_plotter(
             figsize=figsize,
@@ -380,7 +383,7 @@ class RISK(NetworkIO, AnnotationsIO):
     def _load_neighborhoods(
         self,
         network: nx.Graph,
-        distance_metric: str = "dijkstra",
+        distance_metric: str = "louvain",
         louvain_resolution: float = 0.1,
         edge_length_threshold: float = 0.5,
         random_seed: int = 888,
@@ -390,7 +393,7 @@ class RISK(NetworkIO, AnnotationsIO):
         Args:
             network (nx.Graph): The network graph.
             annotations (pd.DataFrame): The matrix of annotations associated with the network.
-            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+            distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
             louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
             edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
             random_seed (int, optional): Seed for random number generation. Defaults to 888.
@@ -404,9 +407,9 @@ class RISK(NetworkIO, AnnotationsIO):
         else:
             for_print_distance_metric = distance_metric
         # Log and display neighborhood settings
-        print(f"Distance metric: '{for_print_distance_metric}'")
-        print(f"Edge length threshold: {edge_length_threshold}")
-        print(f"Random seed: {random_seed}")
+        logger.debug(f"Distance metric: '{for_print_distance_metric}'")
+        logger.debug(f"Edge length threshold: {edge_length_threshold}")
+        logger.debug(f"Random seed: {random_seed}")
 
         # Compute neighborhoods based on the network and distance metric
         neighborhoods = get_network_neighborhoods(
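
Every remaining hunk in risk/risk.py follows the same pattern: `print_header`/`print` become `log_header`/`logger.debug`, all imported from `risk.log`. Verbosity can therefore be toggled globally; a sketch grounded in the new import line at the top of the file:

from risk.log import set_global_verbosity

set_global_verbosity(False)  # what RISK(verbose=False) calls under the hood
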
risk/stats/__init__.py CHANGED
@@ -3,7 +3,7 @@ risk/stats
 ~~~~~~~~~~
 """
 
-from .stats import calculate_significance_matrices
-from .fisher_exact import compute_fisher_exact_test
 from .hypergeom import compute_hypergeom_test
 from .permutation import compute_permutation_test
+from .poisson import compute_poisson_test
+from .stats import calculate_significance_matrices
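
The public surface of `risk.stats` after this change, for reference when updating imports:

# compute_fisher_exact_test no longer exists in 0.0.7
from risk.stats import (
    calculate_significance_matrices,
    compute_hypergeom_test,
    compute_permutation_test,
    compute_poisson_test,  # new
)
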
risk/stats/hypergeom.py CHANGED
@@ -3,8 +3,6 @@ risk/stats/hypergeom
 ~~~~~~~~~~~~~~~~~~~~
 """
 
-from multiprocessing import get_context, Manager
-from tqdm import tqdm
 from typing import Any, Dict
 
 import numpy as np
@@ -12,120 +10,45 @@ from scipy.stats import hypergeom
 
 
 def compute_hypergeom_test(
-    neighborhoods: np.ndarray,
-    annotations: np.ndarray,
-    max_workers: int = 4,
+    neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
 ) -> Dict[str, Any]:
-    """Compute hypergeometric test for enrichment and depletion in neighborhoods.
+    """Compute hypergeometric test for enrichment and depletion in neighborhoods with selectable null distribution.
 
     Args:
         neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
         annotations (np.ndarray): Binary matrix representing annotations.
-        max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
+        null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 
     Returns:
         dict: Dictionary containing depletion and enrichment p-values.
     """
-    # Ensure that the matrices are binary (boolean) and free of NaN values
-    neighborhoods = neighborhoods.astype(bool)  # Convert to boolean
-    annotations = annotations.astype(bool)  # Convert to boolean
-
-    # Initialize the process of calculating p-values using multiprocessing
-    ctx = get_context("spawn")
-    manager = Manager()
-    progress_counter = manager.Value("i", 0)
-    total_tasks = neighborhoods.shape[1] * annotations.shape[1]
-
-    # Calculate the workload per worker
-    chunk_size = total_tasks // max_workers
-    remainder = total_tasks % max_workers
-
-    # Execute the hypergeometric test using multiprocessing
-    with ctx.Pool(max_workers) as pool:
-        with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
-            params_list = []
-            start_idx = 0
-            for i in range(max_workers):
-                end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
-                params_list.append(
-                    (neighborhoods, annotations, start_idx, end_idx, progress_counter)
-                )
-                start_idx = end_idx
-
-            # Start the hypergeometric test process in parallel
-            results = pool.starmap_async(_hypergeom_process_subset, params_list, chunksize=1)
-
-            # Update progress bar based on progress_counter
-            while not results.ready():
-                progress.update(progress_counter.value - progress.n)
-                results.wait(0.05)  # Wait for 50ms
-            # Ensure progress bar reaches 100%
-            progress.update(total_tasks - progress.n)
-
-    # Accumulate results from each worker
-    depletion_pvals, enrichment_pvals = [], []
-    for dp, ep in results.get():
-        depletion_pvals.extend(dp)
-        enrichment_pvals.extend(ep)
-
-    # Reshape the results back into arrays with the appropriate dimensions
-    depletion_pvals = np.array(depletion_pvals).reshape(
-        neighborhoods.shape[1], annotations.shape[1]
+    # Get the total number of nodes in the network
+    total_node_count = neighborhoods.shape[0]
+
+    if null_distribution == "network":
+        # Case 1: Use all nodes as the background
+        background_population = total_node_count
+        neighborhood_sums = np.sum(neighborhoods, axis=0, keepdims=True).T
+        annotation_sums = np.sum(annotations, axis=0, keepdims=True)
+    elif null_distribution == "annotations":
+        # Case 2: Only consider nodes with at least one annotation
+        annotated_nodes = np.sum(annotations, axis=1) > 0
+        background_population = np.sum(annotated_nodes)
+        neighborhood_sums = np.sum(neighborhoods[annotated_nodes], axis=0, keepdims=True).T
+        annotation_sums = np.sum(annotations[annotated_nodes], axis=0, keepdims=True)
+    else:
+        raise ValueError(
+            "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+        )
+
+    # Matrix multiplication for annotated nodes in each neighborhood
+    annotated_in_neighborhood = neighborhoods.T @ annotations
+    # Calculate depletion and enrichment p-values using the hypergeometric distribution
+    depletion_pvals = hypergeom.cdf(
+        annotated_in_neighborhood, background_population, annotation_sums, neighborhood_sums
     )
-    enrichment_pvals = np.array(enrichment_pvals).reshape(
-        neighborhoods.shape[1], annotations.shape[1]
+    enrichment_pvals = hypergeom.sf(
+        annotated_in_neighborhood - 1, background_population, annotation_sums, neighborhood_sums
     )
 
-    return {
-        "depletion_pvals": depletion_pvals,
-        "enrichment_pvals": enrichment_pvals,
-    }
-
-
-def _hypergeom_process_subset(
-    neighborhoods: np.ndarray,
-    annotations: np.ndarray,
-    start_idx: int,
-    end_idx: int,
-    progress_counter,
-) -> tuple:
-    """Process a subset of neighborhoods using the hypergeometric test.
-
-    Args:
-        neighborhoods (np.ndarray): The full neighborhood matrix.
-        annotations (np.ndarray): The annotation matrix.
-        start_idx (int): Starting index of the neighborhood-annotation pairs to process.
-        end_idx (int): Ending index of the neighborhood-annotation pairs to process.
-        progress_counter: Shared counter for tracking progress.
-
-    Returns:
-        tuple: Local p-values for depletion and enrichment.
-    """
-    # Initialize lists to store p-values for depletion and enrichment
-    depletion_pvals = []
-    enrichment_pvals = []
-    # Process the subset of tasks assigned to this worker
-    for idx in range(start_idx, end_idx):
-        i = idx // annotations.shape[1]  # Neighborhood index
-        j = idx % annotations.shape[1]  # Annotation index
-
-        neighborhood = neighborhoods[:, i]
-        annotation = annotations[:, j]
-
-        # Calculate the required values for the hypergeometric test
-        M = annotations.shape[0]  # Total number of items (population size)
-        n = np.sum(annotation)  # Total number of successes in population
-        N = np.sum(neighborhood)  # Total number of draws (sample size)
-        k = np.sum(neighborhood & annotation)  # Number of successes in sample
-
-        # Perform hypergeometric test for depletion
-        p_value_depletion = hypergeom.cdf(k, M, n, N)
-        depletion_pvals.append(p_value_depletion)
-        # Perform hypergeometric test for enrichment
-        p_value_enrichment = hypergeom.sf(k - 1, M, n, N)
-        enrichment_pvals.append(p_value_enrichment)
-
-        # Update the shared progress counter
-        progress_counter.value += 1
-
-    return depletion_pvals, enrichment_pvals
+    return {"depletion_pvals": depletion_pvals, "enrichment_pvals": enrichment_pvals}
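
The rewrite trades the per-pair multiprocessing loop for one vectorized pass: `neighborhoods.T @ annotations` yields every success count k at once, and SciPy broadcasts the (neighborhoods × 1) sample sizes against the (1 × annotations) success-state counts. A self-contained sketch checking the vectorized result against the scalar form the removed `_hypergeom_process_subset` used (toy data; 888 mirrors the package's default seed):

import numpy as np
from scipy.stats import hypergeom

from risk.stats import compute_hypergeom_test

# Toy binary matrices: 100 nodes, 5 neighborhoods, 8 annotation terms
rng = np.random.default_rng(888)
neighborhoods = rng.integers(0, 2, size=(100, 5))
annotations = rng.integers(0, 2, size=(100, 8))

result = compute_hypergeom_test(neighborhoods, annotations, null_distribution="network")

# Spot-check one (neighborhood, annotation) pair against the scalar call:
# k successes in a sample of size N, drawn from M nodes of which n are annotated.
i, j = 0, 0
M = neighborhoods.shape[0]
n = annotations[:, j].sum()
N = neighborhoods[:, i].sum()
k = (neighborhoods[:, i] * annotations[:, j]).sum()
assert np.isclose(result["enrichment_pvals"][i, j], hypergeom.sf(k - 1, M, n, N))
assert np.isclose(result["depletion_pvals"][i, j], hypergeom.cdf(k, M, n, N))
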