chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1476 @@
"""
A module for quantitative spatial analysis of spatial transcriptomics data.

This module provides a collection of functions to compute various spatial
statistics. It includes methods for assessing global and local spatial
autocorrelation, analyzing neighborhood compositions, and evaluating spatial
patterns of cell clusters.

Key functionalities include:
- Global spatial autocorrelation (Moran's I, Geary's C).
- Local spatial autocorrelation (Local Moran's I / LISA for cluster detection).
- Local spatial statistics for hotspot detection (Getis-Ord Gi*).
- Cluster-based analysis (Neighborhood Enrichment, Co-occurrence, Ripley's K).
- Spatial network analysis (Centrality Scores, Network Properties).
- Bivariate spatial correlation analysis (Bivariate Moran's I).
- Categorical spatial analysis (Join Count statistics).
- Spatial centrality measures for tissue architecture.

The primary entry point is the `analyze_spatial_statistics` function, which
dispatches tasks to the appropriate analysis function based on user parameters.
All supported analysis types are accessible through this unified interface,
with a unified 'genes' parameter for consistent gene selection across methods.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional

import anndata as ad
import numpy as np
import pandas as pd
import squidpy as sq

from ..utils.dependency_manager import is_available, require

if TYPE_CHECKING:
    from ..spatial_mcp_adapter import ToolContext

from ..models.analysis import SpatialStatisticsResult
from ..models.data import SpatialStatisticsParameters
from ..utils.adata_utils import (
    ensure_categorical,
    require_spatial_coords,
    select_genes_for_analysis,
    to_dense,
    validate_adata_basics,
)
from ..utils.compute import ensure_spatial_neighbors_async
from ..utils.exceptions import (
    DataCompatibilityError,
    DataNotFoundError,
    DependencyError,
    ParameterError,
    ProcessingError,
)

# ============================================================================
# MAIN ENTRY POINT
# ============================================================================

async def analyze_spatial_statistics(
    data_id: str,
    ctx: ToolContext,
    params: SpatialStatisticsParameters,  # No default - must be provided by caller (LLM)
) -> SpatialStatisticsResult:
    """
    Serves as the central dispatcher for executing various spatial analysis methods.

    This function validates the input data, computes a spatial neighbor graph if one
    does not exist, and routes the analysis to the appropriate specialized function
    based on the `analysis_type` parameter. Results from the analysis are added to
    the `AnnData` object within the data store. Note that visualization is handled
    by a separate function.

    Parameters
    ----------
    data_id : str
        The identifier for the dataset.
    ctx : ToolContext
        Tool context for data access and logging.
    params : SpatialStatisticsParameters
        An object containing the parameters for the analysis, including the
        specific `analysis_type` to perform.

    Returns
    -------
    SpatialStatisticsResult
        An object containing the statistical results and metadata from the analysis.

    Raises
    ------
    DataNotFoundError
        If the specified dataset is not found in the data store.
    ParameterError
        If the provided parameters are not valid for the requested analysis.
    ProcessingError
        If an error occurs during the execution of the analysis.
    """
    # Validate parameters
    supported_types = [
        "neighborhood",
        "co_occurrence",
        "ripley",
        "moran",
        "local_moran",  # Added Local Moran's I
        "geary",
        "centrality",
        "getis_ord",
        "bivariate_moran",
        "join_count",  # Traditional Join Count for binary data (2 categories)
        "local_join_count",  # Local Join Count for multi-category data (>2 categories)
        "network_properties",
        "spatial_centrality",
    ]

    if params.analysis_type not in supported_types:
        raise ParameterError(f"Unsupported analysis type: {params.analysis_type}")

    if params.n_neighbors <= 0:
        raise ParameterError(f"n_neighbors must be positive, got {params.n_neighbors}")

    # Retrieve dataset via ToolContext
    try:
        adata = await ctx.get_adata(data_id)

        # Basic validation: min 10 cells, spatial coordinates exist
        validate_adata_basics(adata, min_obs=10)
        require_spatial_coords(adata)  # Validates spatial coords exist

        # Determine if cluster_key is required for this analysis type
        analyses_requiring_cluster_key = {
            "neighborhood",
            "co_occurrence",
            "ripley",
            "join_count",
            "local_join_count",
            "centrality",
            "network_properties",
            "spatial_centrality",
        }

        # Ensure cluster key only for analyses that require it
        cluster_key = None
        if params.analysis_type in analyses_requiring_cluster_key:
            if params.cluster_key not in adata.obs.columns:
                available = [
                    c
                    for c in adata.obs.columns
                    if "cluster" in c.lower() or c in ["leiden", "louvain"]
                ]
                raise DataNotFoundError(
                    f"Cluster key '{params.cluster_key}' not found. "
                    f"Available: {available if available else 'None'}. "
                    f"Run preprocess_data() first to generate clusters."
                )
            ensure_categorical(adata, params.cluster_key)
            cluster_key = params.cluster_key

        # Ensure spatial neighbors
        await ensure_spatial_neighbors_async(adata, ctx, n_neighs=params.n_neighbors)

        # Route to appropriate analysis function
        if params.analysis_type == "moran":
            result = _analyze_morans_i(adata, params, ctx)
        elif params.analysis_type == "local_moran":
            result = _analyze_local_moran(adata, params, ctx)
        elif params.analysis_type == "geary":
            result = _analyze_gearys_c(adata, params, ctx)
        elif params.analysis_type == "neighborhood":
            result = _analyze_neighborhood_enrichment(adata, cluster_key, ctx)
        elif params.analysis_type == "co_occurrence":
            result = _analyze_co_occurrence(adata, cluster_key, ctx)
        elif params.analysis_type == "ripley":
            result = _analyze_ripleys_k(adata, cluster_key, ctx)
        elif params.analysis_type == "getis_ord":
            result = _analyze_getis_ord(adata, params, ctx)
        elif params.analysis_type == "centrality":
            result = _analyze_centrality(adata, cluster_key, ctx)
        elif params.analysis_type == "bivariate_moran":
            result = _analyze_bivariate_moran(adata, params, ctx)
        elif params.analysis_type == "join_count":
            result = _analyze_join_count(adata, cluster_key, params, ctx)
        elif params.analysis_type == "local_join_count":
            result = _analyze_local_join_count(adata, cluster_key, params, ctx)
        elif params.analysis_type == "network_properties":
            result = _analyze_network_properties(adata, cluster_key, params, ctx)
        elif params.analysis_type == "spatial_centrality":
            result = await _analyze_spatial_centrality(adata, cluster_key, params, ctx)
        else:
            raise ParameterError(
                f"Analysis type {params.analysis_type} not implemented"
            )

        # COW FIX: No need to update data_store - changes already reflected via direct reference
        # All modifications to adata.obs/uns/obsp are in-place and preserved

        # Ensure result is a dictionary
        if not isinstance(result, dict):
            if hasattr(result, "dict"):
                result = result.dict()
            else:
                raise ProcessingError("Invalid result format from analysis function")

        # Add metadata
        result.update(
            {
                "n_cells": adata.n_obs,
                "n_neighbors": params.n_neighbors,
            }
        )

        # Store scientific metadata for reproducibility
        from ..utils.adata_utils import store_analysis_metadata

        # Determine results keys based on analysis type
        results_keys_dict = {"obs": [], "var": [], "obsm": [], "uns": []}
        if params.analysis_type in ["moran", "geary"]:
            results_keys_dict["uns"].append(f"{params.analysis_type}s_i")
        elif params.analysis_type == "local_moran":
            results_keys_dict["obs"].extend(
                [f"{gene}_local_moran" for gene in (params.genes or [])]
            )
        elif params.analysis_type == "getis_ord":
            if params.genes:
                for gene in params.genes:
                    results_keys_dict["obs"].extend(
                        [f"{gene}_getis_ord_z", f"{gene}_getis_ord_p"]
                    )
        elif params.analysis_type in ["neighborhood", "co_occurrence"]:
            results_keys_dict["uns"].append(params.analysis_type)
        elif params.analysis_type == "ripley":
            results_keys_dict["uns"].append("ripley")
        elif params.analysis_type == "centrality":
            results_keys_dict["uns"].append("centrality_scores")

        # Prepare parameters dict
        parameters_dict = {
            "n_neighbors": params.n_neighbors,
        }
        if cluster_key:
            parameters_dict["cluster_key"] = cluster_key
        if params.genes:
            parameters_dict["genes"] = params.genes
        # Add n_perms based on analysis type
        if params.analysis_type in ["moran", "local_moran", "geary"]:
            parameters_dict["n_perms"] = params.moran_n_perms

        # Extract statistics for metadata
        statistics_dict = {
            "n_cells": adata.n_obs,
        }
        if "n_significant" in result:
            statistics_dict["n_significant"] = result["n_significant"]
        if "mean_score" in result:
            statistics_dict["mean_score"] = result["mean_score"]

        # Store metadata
        store_analysis_metadata(
            adata,
            analysis_name=f"spatial_stats_{params.analysis_type}",
            method=params.analysis_type,
            parameters=parameters_dict,
            results_keys=results_keys_dict,
            statistics=statistics_dict,
        )

        # Extract summary fields for MCP response (detailed statistics excluded)
        summary = _extract_result_summary(result, params.analysis_type)

        return SpatialStatisticsResult(
            data_id=data_id,
            analysis_type=params.analysis_type,
            n_features_analyzed=summary["n_features_analyzed"],
            n_significant=summary["n_significant"],
            top_features=summary["top_features"],
            summary_metrics=summary["summary_metrics"],
            results_key=summary.get("results_key"),
            statistics=result,  # Excluded from MCP response via Field(exclude=True)
        )

    except (DataNotFoundError, ParameterError, DataCompatibilityError):
        raise
    except Exception as e:
        raise ProcessingError(
            f"Error in {params.analysis_type} analysis: {e}"
        ) from e

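A minimal caller-side sketch of the dispatcher follows. It assumes `SpatialStatisticsParameters` accepts the field names used in this module as keyword arguments (Pydantic-style); the gene names, the helper function, and the surrounding async setup are illustrative and not part of this file.

# Hypothetical caller-side sketch (not part of this module): run global Moran's I
# on two marker genes for a registered dataset, then read the compact summary.
from chatspatial.models.data import SpatialStatisticsParameters
from chatspatial.tools.spatial_statistics import analyze_spatial_statistics

async def run_moran_example(ctx, data_id: str):
    params = SpatialStatisticsParameters(
        analysis_type="moran",     # one of the supported types listed above
        genes=["GeneA", "GeneB"],  # hypothetical gene names
        n_neighbors=6,
        moran_n_perms=100,
    )
    result = await analyze_spatial_statistics(data_id, ctx, params)
    # Compact summary fields are safe to surface in the MCP response; the full
    # statistics stay on the AnnData object under result.results_key.
    print(result.n_features_analyzed, result.n_significant, result.top_features)
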
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================


def _extract_result_summary(
    result: dict[str, Any], analysis_type: str
) -> dict[str, Any]:
    """Extract compact summary from analysis result for MCP response.

    This function extracts the most informative fields from detailed analysis results,
    keeping the MCP response small while preserving actionable insights for the LLM.

    Args:
        result: Full result dictionary from analysis function
        analysis_type: Type of spatial analysis performed

    Returns:
        Dictionary with standardized summary fields:
        - n_features_analyzed: Number of genes/clusters analyzed
        - n_significant: Number of significant results
        - top_features: List of top significant features (max 10)
        - summary_metrics: Key numeric metrics
        - results_key: Key in adata.uns for full results (if applicable)
    """
    summary: dict[str, Any] = {
        "n_features_analyzed": 0,
        "n_significant": 0,
        "top_features": [],
        "summary_metrics": {},
        "results_key": None,
    }

    # Extract based on analysis type
    if analysis_type == "moran":
        summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
        summary["n_significant"] = result.get("n_significant", 0)
        summary["top_features"] = result.get("top_highest_autocorrelation", [])[:10]
        summary["summary_metrics"] = {"mean_morans_i": result.get("mean_morans_i", 0.0)}
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "geary":
        summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
        summary["summary_metrics"] = {"mean_gearys_c": result.get("mean_gearys_c", 0.0)}
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "local_moran":
        summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
        summary["n_significant"] = result.get("n_significant_total", 0)
        summary["top_features"] = result.get("top_clustered_genes", [])[:10]
        summary["summary_metrics"] = {
            "mean_hotspots": result.get("mean_hotspots_per_gene", 0.0),
            "mean_coldspots": result.get("mean_coldspots_per_gene", 0.0),
        }

    elif analysis_type == "getis_ord":
        genes_analyzed = result.get("genes_analyzed", [])
        summary["n_features_analyzed"] = len(genes_analyzed)
        summary["top_features"] = genes_analyzed[:10]
        # Count total hotspots across all genes
        per_gene_results = result.get("results", {})
        total_hot = sum(r.get("n_hot_spots", 0) for r in per_gene_results.values())
        total_cold = sum(r.get("n_cold_spots", 0) for r in per_gene_results.values())
        summary["summary_metrics"] = {
            "total_hotspots": total_hot,
            "total_coldspots": total_cold,
        }

    elif analysis_type == "neighborhood":
        summary["n_features_analyzed"] = result.get("n_clusters", 0)
        summary["summary_metrics"] = {
            "max_enrichment": result.get("max_enrichment", 0.0),
            "min_enrichment": result.get("min_enrichment", 0.0),
        }
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "co_occurrence":
        summary["n_features_analyzed"] = result.get("n_clusters", 0)
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "ripley":
        summary["n_features_analyzed"] = result.get("n_clusters", 0)
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "centrality":
        summary["n_features_analyzed"] = result.get("n_clusters", 0)
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type == "bivariate_moran":
        pairs = result.get("gene_pairs", [])
        summary["n_features_analyzed"] = len(pairs)
        summary["top_features"] = [f"{p[0]}-{p[1]}" for p in pairs[:10]]
        # Extract significant correlations
        per_pair = result.get("results", {})
        significant = [k for k, v in per_pair.items() if abs(v.get("moran_i", 0)) > 0.3]
        summary["n_significant"] = len(significant)

    elif analysis_type in ["join_count", "local_join_count"]:
        summary["n_features_analyzed"] = result.get("n_categories", 0)
        summary["n_significant"] = result.get("n_significant", 0)
        summary["results_key"] = result.get("analysis_key")

    elif analysis_type in ["network_properties", "spatial_centrality"]:
        summary["results_key"] = result.get("analysis_key")
        summary["summary_metrics"] = {
            k: v
            for k, v in result.items()
            if isinstance(v, (int, float)) and k not in ("n_cells", "n_neighbors")
        }

    return summary


def _get_optimal_n_jobs(n_obs: int, requested_n_jobs: Optional[int] = None) -> int:
    """Determine optimal number of parallel jobs based on data size."""
    import os

    if requested_n_jobs is not None:
        if requested_n_jobs == -1:
            return os.cpu_count() or 1
        return requested_n_jobs

    # Smart defaults based on data size
    if n_obs < 1000:
        return 1  # Single thread for small data
    elif n_obs < 5000:
        return min(2, os.cpu_count() or 1)
    else:
        return min(4, os.cpu_count() or 1)


# ============================================================================
# CORE ANALYSIS FUNCTIONS
# ============================================================================

def _analyze_morans_i(
    adata: ad.AnnData,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Calculates Moran's I to measure global spatial autocorrelation for genes.

    Moran's I is a statistic that indicates whether the expression of a gene is
    spatially clustered, dispersed, or randomly distributed.
    - A value near +1.0 indicates strong clustering of similar expression values.
    - A value near -1.0 indicates dispersion (a checkerboard-like pattern).
    - A value near 0 indicates a random spatial distribution.

    The analysis is performed on highly variable genes by default, but a
    specific gene list can be provided.
    """
    # Unified gene selection
    genes = select_genes_for_analysis(
        adata,
        genes=params.genes,
        n_genes=params.n_top_genes,
        analysis_name="Moran's I",
    )

    # Optimize parallelization
    n_jobs = _get_optimal_n_jobs(adata.n_obs, params.n_jobs)

    # Run spatial autocorrelation
    sq.gr.spatial_autocorr(
        adata,
        mode="moran",
        genes=genes,
        n_perms=params.moran_n_perms,
        two_tailed=params.moran_two_tailed,
        n_jobs=n_jobs,
        backend=params.backend,
        show_progress_bar=False,
    )

    # Extract results
    moran_key = "moranI"
    if moran_key in adata.uns:
        results_df = adata.uns[moran_key]

        # Get top significant genes
        significant_genes = results_df[results_df["pval_norm"] < 0.05].index.tolist()

        # Calculate appropriate number of top genes to return
        # To avoid returning identical lists, we take at most half of the analyzed genes
        # This ensures top_highest and top_lowest are different gene sets
        n_analyzed = len(results_df)
        n_top = min(10, max(3, n_analyzed // 2))

        # Ensure we never return more than half the genes to avoid duplicates
        n_top = min(n_top, n_analyzed // 2) if n_analyzed >= 6 else 0

        return {
            "n_genes_analyzed": len(genes),
            "n_significant": len(significant_genes),
            "top_highest_autocorrelation": (
                results_df.nlargest(n_top, "I").index.tolist() if n_top > 0 else []
            ),
            "top_lowest_autocorrelation": (
                results_df.nsmallest(n_top, "I").index.tolist() if n_top > 0 else []
            ),
            "mean_morans_i": float(results_df["I"].mean()),
            "analysis_key": moran_key,
            "note": "top_highest/top_lowest refer to autocorrelation strength, not positive/negative correlation",
        }

    raise ProcessingError("Moran's I computation did not produce results")

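For context, a minimal sketch of how the stored Moran's I table could be inspected after this tool has run; it relies only on what the function above writes (squidpy's `adata.uns["moranI"]` DataFrame with columns `I` and `pval_norm`), and the variable names are illustrative.

# Illustrative follow-up (not part of this module): rank genes by Moran's I
# after analyze_spatial_statistics(..., analysis_type="moran") has populated adata.
moran_df = adata.uns["moranI"]                        # DataFrame indexed by gene name
top = moran_df.sort_values("I", ascending=False).head(10)
significant = moran_df[moran_df["pval_norm"] < 0.05]  # same threshold used above
print(f"{len(significant)} genes show significant spatial autocorrelation")
print(top[["I", "pval_norm"]])
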
def _analyze_gearys_c(
    adata: ad.AnnData,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """Compute Geary's C spatial autocorrelation."""
    # Unified gene selection
    genes = select_genes_for_analysis(
        adata,
        genes=params.genes,
        n_genes=params.n_top_genes,
        analysis_name="Geary's C",
    )

    sq.gr.spatial_autocorr(
        adata,
        mode="geary",
        genes=genes,
        n_perms=params.moran_n_perms,
        n_jobs=_get_optimal_n_jobs(adata.n_obs, params.n_jobs),
        show_progress_bar=False,
    )

    # Extract results (squidpy returns DataFrame, not dict)
    geary_key = "gearyC"
    if geary_key in adata.uns:
        import pandas as pd

        results_df = adata.uns[geary_key]
        if isinstance(results_df, pd.DataFrame):
            return {
                "n_genes_analyzed": len(genes),
                "mean_gearys_c": float(results_df["C"].mean()),
                "analysis_key": geary_key,
            }

    raise ProcessingError("Geary's C computation did not produce results")


def _analyze_neighborhood_enrichment(
    adata: ad.AnnData,
    cluster_key: str,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """Compute neighborhood enrichment analysis."""
    sq.gr.nhood_enrichment(adata, cluster_key=cluster_key)

    analysis_key = f"{cluster_key}_nhood_enrichment"
    if analysis_key in adata.uns:
        z_scores = adata.uns[analysis_key]["zscore"]

        # Use nanmax/nanmin to handle NaN values from sparse cell type distributions
        # NaN can occur when certain cell type pairs have insufficient neighborhoods
        return {
            "n_clusters": len(z_scores),
            "max_enrichment": float(np.nanmax(z_scores)),
            "min_enrichment": float(np.nanmin(z_scores)),
            "analysis_key": analysis_key,
        }

    raise ProcessingError("Neighborhood enrichment did not produce results")


def _analyze_co_occurrence(
    adata: ad.AnnData,
    cluster_key: str,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """Compute co-occurrence analysis."""
    sq.gr.co_occurrence(adata, cluster_key=cluster_key)

    analysis_key = f"{cluster_key}_co_occurrence"
    if analysis_key in adata.uns:
        co_occurrence = adata.uns[analysis_key]["occ"]

        return {"n_clusters": len(co_occurrence), "analysis_key": analysis_key}

    raise ProcessingError("Co-occurrence analysis did not produce results")


def _analyze_ripleys_k(
    adata: ad.AnnData,
    cluster_key: str,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """Compute Ripley's K function."""
    try:
        sq.gr.ripley(
            adata,
            cluster_key=cluster_key,
            mode="L",  # L-function (variance-stabilized)
            n_simulations=20,
            n_observations=min(1000, adata.n_obs),
            max_dist=None,
            n_steps=50,
        )

        analysis_key = f"{cluster_key}_ripley_L"
        return {"analysis_completed": True, "analysis_key": analysis_key}
    except Exception as e:
        raise ProcessingError(f"Ripley's K analysis failed: {e}") from e

def _analyze_getis_ord(
    adata: ad.AnnData,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Performs Getis-Ord Gi* analysis to identify local spatial clusters.

    This method identifies statistically significant hot spots (clusters of high
    gene expression) and cold spots (clusters of low gene expression). It computes
    a Z-score for each spot, where high positive Z-scores indicate hot spots and
    low negative Z-scores indicate cold spots.

    The significance threshold is determined by params.getis_ord_alpha, and
    multiple testing correction is applied according to params.getis_ord_correction.

    References
    ----------
    Getis, A. & Ord, J.K. (1992). The Analysis of Spatial Association by Use of
    Distance Statistics. Geographical Analysis, 24(3), 189-206.

    Ord, J.K. & Getis, A. (1995). Local Spatial Autocorrelation Statistics:
    Distributional Issues and an Application. Geographical Analysis, 27(4), 286-306.
    """
    # Unified gene selection
    genes = select_genes_for_analysis(
        adata,
        genes=params.genes,
        n_genes=params.n_top_genes,
        analysis_name="Getis-Ord Gi*",
    )

    getis_ord_results = {}

    require("esda")  # Raises ImportError with install instructions if missing
    require("libpysal")  # Raises ImportError with install instructions if missing
    from esda.getisord import G_Local
    from libpysal import weights
    from scipy.stats import norm

    try:

        # Calculate Z-score threshold from alpha level (two-tailed test)
        z_threshold = norm.ppf(1 - params.getis_ord_alpha / 2)

        coords = require_spatial_coords(adata)
        w = weights.KNN.from_array(coords, k=params.n_neighbors)
        w.transform = "r"

        # OPTIMIZATION: Extract all genes at once before loop (batch extraction)
        # This provides 50-150x speedup by avoiding repeated AnnData slicing overhead
        y_all_genes = to_dense(adata[:, genes].X)

        # Collect all p-values for multiple testing correction
        all_pvalues = {}

        for i, gene in enumerate(genes):
            # OPTIMIZATION: Direct indexing from pre-extracted dense matrix (fast!)
            y = y_all_genes[:, i].astype(np.float64)

            local_g = G_Local(y, w, transform="R", star=True)

            # Store raw results in adata.obs
            adata.obs[f"{gene}_getis_ord_z"] = local_g.Zs
            adata.obs[f"{gene}_getis_ord_p"] = local_g.p_sim

            # Store p-values for correction
            all_pvalues[gene] = local_g.p_sim

            # Count hotspots/coldspots using Z-threshold
            getis_ord_results[gene] = {
                "mean_z": float(np.mean(local_g.Zs)),
                "std_z": float(np.std(local_g.Zs)),
                "n_hot_spots": int(np.sum(local_g.Zs > z_threshold)),
                "n_cold_spots": int(np.sum(local_g.Zs < -z_threshold)),
                "n_significant_raw": int(
                    np.sum(local_g.p_sim < params.getis_ord_alpha)
                ),
            }

        # Apply multiple testing correction if requested
        if params.getis_ord_correction != "none" and len(genes) > 1:
            if params.getis_ord_correction == "bonferroni":
                corrected_alpha = params.getis_ord_alpha / len(genes)
                corrected_z_threshold = norm.ppf(1 - corrected_alpha / 2)

                for gene in genes:
                    p_values = all_pvalues[gene]
                    adata.obs[f"{gene}_getis_ord_p_corrected"] = np.minimum(
                        p_values * len(genes), 1.0
                    )

                    z_scores = adata.obs[f"{gene}_getis_ord_z"].values
                    getis_ord_results[gene]["n_hot_spots_corrected"] = int(
                        np.sum(z_scores > corrected_z_threshold)
                    )
                    getis_ord_results[gene]["n_cold_spots_corrected"] = int(
                        np.sum(z_scores < -corrected_z_threshold)
                    )

            elif params.getis_ord_correction == "fdr_bh":
                from statsmodels.stats.multitest import multipletests

                for gene in genes:
                    p_values = all_pvalues[gene]
                    _, p_corrected, _, _ = multipletests(
                        p_values, alpha=params.getis_ord_alpha, method="fdr_bh"
                    )
                    adata.obs[f"{gene}_getis_ord_p_corrected"] = p_corrected

                    getis_ord_results[gene]["n_significant_corrected"] = int(
                        np.sum(p_corrected < params.getis_ord_alpha)
                    )

                    z_scores = adata.obs[f"{gene}_getis_ord_z"].values
                    significant_mask = p_corrected < params.getis_ord_alpha
                    getis_ord_results[gene]["n_hot_spots_corrected"] = int(
                        np.sum((z_scores > z_threshold) & significant_mask)
                    )
                    getis_ord_results[gene]["n_cold_spots_corrected"] = int(
                        np.sum((z_scores < -z_threshold) & significant_mask)
                    )

    except Exception as e:
        raise ProcessingError(f"Getis-Ord analysis failed: {e}") from e

    return {
        "method": "Getis-Ord Gi* (star=True)",
        "n_genes_analyzed": len(getis_ord_results),
        "genes_analyzed": list(getis_ord_results),
        "parameters": {
            "n_neighbors": params.n_neighbors,
            "alpha": params.getis_ord_alpha,
            "z_threshold": float(z_threshold),
            "correction": params.getis_ord_correction,
        },
        "results": getis_ord_results,
    }

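A short sketch of how the per-spot columns written by this function could be used downstream. It assumes a hypothetical gene name and relies only on the `{gene}_getis_ord_z` / `{gene}_getis_ord_p` columns stored above; the 1.96 cutoff corresponds to the default alpha of 0.05 (two-tailed).

# Illustrative follow-up (not part of this module): pull out hot-spot cells for
# one gene from the columns written by _analyze_getis_ord. "GeneA" is hypothetical.
gene = "GeneA"
z = adata.obs[f"{gene}_getis_ord_z"]
p = adata.obs[f"{gene}_getis_ord_p"]
hot_spots = adata.obs_names[(z > 1.96) & (p < 0.05)]    # alpha = 0.05, two-tailed
cold_spots = adata.obs_names[(z < -1.96) & (p < 0.05)]
print(f"{len(hot_spots)} hot-spot cells, {len(cold_spots)} cold-spot cells for {gene}")
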
def _analyze_centrality(
    adata: ad.AnnData,
    cluster_key: str,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """Compute centrality scores."""
    sq.gr.centrality_scores(adata, cluster_key=cluster_key)

    analysis_key = f"{cluster_key}_centrality_scores"
    if analysis_key in adata.uns:
        scores = adata.uns[analysis_key]

        return {
            "analysis_completed": True,
            "analysis_key": analysis_key,
            "n_clusters": len(scores) if isinstance(scores, dict) else "computed",
        }

    raise ProcessingError("Centrality analysis did not produce results")


# ============================================================================
# ADVANCED ANALYSIS FUNCTIONS (from spatial_statistics.py)
# ============================================================================

def _analyze_bivariate_moran(
    adata: ad.AnnData,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Calculates Bivariate Moran's I to assess spatial correlation between two genes.

    This statistic measures how the expression of one gene in a specific location
    relates to the expression of a second gene in neighboring locations. It is useful
    for identifying pairs of genes that are co-localized or spatially exclusive.
    A positive value suggests that high expression of gene A is surrounded by high
    expression of gene B.
    """
    # Get gene pairs from parameters - NO ARBITRARY DEFAULTS
    if not params.gene_pairs:
        raise ParameterError("Bivariate Moran's I requires gene_pairs parameter.")
    gene_pairs = params.gene_pairs

    results = {}

    # Use centralized dependency manager for consistent error handling
    require("libpysal")  # Raises ImportError with install instructions if missing
    from libpysal.weights import KNN

    try:

        coords = require_spatial_coords(adata)
        w = KNN.from_array(coords, k=params.n_neighbors)
        w.transform = "R"

        # OPTIMIZATION: Extract all unique genes involved in pairs (batch extraction)
        # This provides 20-40x speedup by avoiding repeated AnnData slicing
        # See test_spatial_statistics_extreme_scale.py for performance validation
        all_genes_in_pairs = list(
            set([g for pair in gene_pairs for g in pair if g in adata.var_names])
        )

        expr_all = to_dense(adata[:, all_genes_in_pairs].X)

        # Create gene index mapping for fast lookup
        gene_to_idx = {gene: i for i, gene in enumerate(all_genes_in_pairs)}

        for gene1, gene2 in gene_pairs:
            if gene1 in adata.var_names and gene2 in adata.var_names:
                # OPTIMIZATION: Direct indexing from pre-extracted matrix (fast!)
                idx1 = gene_to_idx[gene1]
                idx2 = gene_to_idx[gene2]
                x = expr_all[:, idx1].flatten()
                y = expr_all[:, idx2].flatten()

                # Compute bivariate Moran's I using sparse matrix operations
                # Formula: I_xy = (n / S0) * (x - x̄)ᵀ W (y - ȳ) / sqrt(Var(x) * Var(y))
                # Reference: Wartenberg (1985), Anselin et al. (2002)
                n = len(x)
                x_mean = np.mean(x)
                y_mean = np.mean(y)

                # Centered values
                x_centered = x - x_mean
                y_centered = y - y_mean

                # OPTIMIZED: Use sparse matrix multiplication instead of O(n²) loop
                # numerator = Σᵢ Σⱼ wᵢⱼ(xᵢ - x̄)(yⱼ - ȳ) = (x - x̄)ᵀ @ W @ (y - ȳ)
                numerator = float(x_centered @ w.sparse @ y_centered)

                # FIX: Bivariate Moran's I uses sqrt of product of both variances
                # Not just x's variance (which was the bug)
                var_x = np.sum(x_centered**2)
                var_y = np.sum(y_centered**2)
                denominator = np.sqrt(var_x * var_y)

                if denominator > 0:
                    moran_i = (n / w.sparse.sum()) * (numerator / denominator)
                else:
                    moran_i = 0.0

                results[f"{gene1}_vs_{gene2}"] = float(moran_i)

    except Exception as e:
        raise ProcessingError(f"Bivariate Moran's I failed: {e}") from e

    return {
        "n_pairs_analyzed": len(results),
        "bivariate_morans_i": results,
        "mean_bivariate_i": float(np.mean(list(results.values()))) if results else 0,
    }

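To make the formula used in the loop above concrete, here is a tiny self-contained numerical sketch of the same bivariate Moran's I computation on toy data, using a dense row-standardized weight matrix in place of the libpysal KNN weights (array sizes and values are made up purely for illustration):

# Toy check of I_xy = (n / S0) * (x - x̄)ᵀ W (y - ȳ) / sqrt(Σ(x - x̄)² · Σ(y - ȳ)²)
import numpy as np

rng = np.random.default_rng(0)
n = 5
x = rng.normal(size=n)                      # expression of gene 1 across 5 spots
y = rng.normal(size=n)                      # expression of gene 2 across 5 spots
W = rng.random((n, n))
np.fill_diagonal(W, 0.0)                    # no self-neighbors
W = W / W.sum(axis=1, keepdims=True)        # row-standardize, as w.transform = "R" does

xc, yc = x - x.mean(), y - y.mean()
numerator = xc @ W @ yc                     # (x - x̄)ᵀ W (y - ȳ)
denominator = np.sqrt((xc**2).sum() * (yc**2).sum())
I_xy = (n / W.sum()) * (numerator / denominator)   # with row standardization, W.sum() == n
print(f"bivariate Moran's I = {I_xy:.3f}")
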
def _analyze_join_count(
    adata: ad.AnnData,
    cluster_key: str,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Compute traditional Join Count statistics for BINARY categorical spatial data.

    IMPORTANT: This method only works for binary data (exactly 2 categories).
    For multi-category data (>2 categories), use 'local_join_count' instead.

    Join Count statistics (Cliff & Ord 1981) measure spatial autocorrelation in
    binary categorical data by counting the number of joins between neighboring
    spatial units of the same or different categories.

    Returns three types of joins:
    - BB (Black-Black): Both neighbors are category 1
    - WW (White-White): Both neighbors are category 0
    - BW (Black-White): Neighbors are different categories

    Parameters
    ----------
    adata : AnnData
        Annotated data object with spatial coordinates in .obsm['spatial']
    cluster_key : str
        Column in adata.obs containing the categorical variable (must have exactly 2 categories)
    params : SpatialStatisticsParameters
        Analysis parameters including n_neighbors
    ctx : ToolContext
        ToolContext for logging and data access

    Returns
    -------
    Dict[str, Any]
        Dictionary containing:
        - bb: Number of Black-Black joins
        - ww: Number of White-White joins
        - bw: Number of Black-White joins
        - J: Total number of joins
        - p_value: Significance level from permutation test

    References
    ----------
    Cliff, A.D. & Ord, J.K. (1981). Spatial Processes. Pion, London.

    See Also
    --------
    _analyze_local_join_count : For multi-category data (>2 categories)
    """
    # Check for required dependencies
    if not is_available("esda") or not is_available("libpysal"):
        raise DependencyError(
            "esda or libpysal package not installed. Install with: pip install esda libpysal"
        )

    try:
        from esda.join_counts import Join_Counts
        from libpysal.weights import KNN

        coords = require_spatial_coords(adata)
        w = KNN.from_array(coords, k=params.n_neighbors)

        # Get categorical data
        y = adata.obs[cluster_key].cat.codes.values

        # Compute join counts
        jc = Join_Counts(y, w)

        return {
            "bb": float(jc.bb),  # Black-Black joins
            "ww": float(jc.ww),  # White-White joins
            "bw": float(jc.bw),  # Black-White joins
            "J": float(jc.J),  # Total joins
            "p_value": float(jc.p_sim) if hasattr(jc, "p_sim") else None,
        }

    except Exception as e:
        raise ProcessingError(f"Join Count analysis failed: {e}") from e

def _analyze_local_join_count(
    adata: ad.AnnData,
    cluster_key: str,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Compute Local Join Count statistics for MULTI-CATEGORY categorical spatial data.

    This method extends traditional Join Count statistics to handle data with more than
    2 categories by using Local Join Count Statistics (Anselin & Li 2019). Each category
    is converted to a binary indicator variable, and local statistics are computed to
    identify spatial clusters of each category.

    WHEN TO USE:
    - Data has MORE THAN 2 categories (e.g., cell types, tissue domains)
    - Want to identify WHERE each category spatially clusters
    - Need category-specific clustering patterns

    For binary data (exactly 2 categories), use 'join_count' instead for traditional
    global statistics.

    METHOD:
    1. One-hot encode: Convert multi-category variable to binary indicators
    2. For each category: Compute local join count (# of same-category neighbors)
    3. Permutation test: Assess statistical significance
    4. Store results: Local statistics in adata.obs, summary in return value

    Parameters
    ----------
    adata : AnnData
        Annotated data object with spatial coordinates in .obsm['spatial']
    cluster_key : str
        Column in adata.obs containing the categorical variable (can have any number of categories)
    params : SpatialStatisticsParameters
        Analysis parameters including n_neighbors
    ctx : ToolContext
        ToolContext for logging and data access

    Returns
    -------
    Dict[str, Any]
        Dictionary containing:
        - method: Method name and reference
        - n_categories: Number of categories analyzed
        - categories: List of category names
        - per_category_stats: Statistics for each category
        - total_joins: Sum of local join counts across all locations
        - mean_local_joins: Average local join count per location
        - n_significant: Number of locations with significant clustering (p < 0.05)
        - n_hotspots: Number of locations with positive significant clustering
        - interpretation: How to interpret the results

    Notes
    -----
    Results are stored in adata.obs as:
    - 'ljc_{category}': Local join count values for each category
    - 'ljc_{category}_pvalue': Significance levels (from permutation test)

    High local join count values indicate locations where category members cluster together.
    P-values < 0.05 indicate statistically significant local clustering.

    References
    ----------
    Anselin, L., & Li, X. (2019). Operational Local Join Count Statistics for Cluster
    Detection. Journal of Geographical Systems, 21(2), 189-210.
    https://doi.org/10.1007/s10109-019-00299-x

    See Also
    --------
    _analyze_join_count : For binary data (2 categories) using traditional Join Count

    Examples
    --------
    For a dataset with 7 cell type categories:
    >>> result = await _analyze_local_join_count(adata, 'leiden', params, ctx)
    >>> # Check which cell types show significant clustering
    >>> for cat, stats in result['per_category_stats'].items():
    ...     print(f"{cat}: {stats['n_hotspots']} significant hotspots")
    """
    # Check for required dependencies
    if not is_available("esda") or not is_available("libpysal"):
        raise DependencyError(
            "esda or libpysal package not installed (requires esda >= 2.4.0). "
            "Install with: pip install esda libpysal"
        )

    try:
        from esda.join_counts_local import Join_Counts_Local
        from libpysal.weights import KNN

        coords = require_spatial_coords(adata)

        # Create PySAL W object directly from coordinates using KNN
        # This ensures compatibility with Join_Counts_Local
        w = KNN.from_array(coords, k=params.n_neighbors)

        # Get unique categories
        categories = adata.obs[cluster_key].unique()
        n_categories = len(categories)

        results = {}

        # Analyze each category separately
        for category in categories:
            # Create binary indicator: 1 if cell is this category, 0 otherwise
            y = (adata.obs[cluster_key] == category).astype(int).values

            # Compute Local Join Count statistics
            ljc = Join_Counts_Local(connectivity=w).fit(y)

            # Store local statistics in adata.obs
            adata.obs[f"ljc_{category}"] = ljc.LJC
            adata.obs[f"ljc_{category}_pvalue"] = ljc.p_sim

            # Compute summary statistics
            results[str(category)] = {
                "total_joins": float(ljc.LJC.sum()),
                "mean_local_joins": float(ljc.LJC.mean()),
                "std_local_joins": float(ljc.LJC.std()),
                "n_significant": int((ljc.p_sim < 0.05).sum()),
                "n_hotspots": int(((ljc.LJC > 0) & (ljc.p_sim < 0.05)).sum()),
            }

        # Store summary in adata.uns
        adata.uns["local_join_count"] = {
            "method": "Local Join Count Statistics (Anselin & Li 2019)",
            "cluster_key": cluster_key,
            "n_categories": n_categories,
            "categories": [str(c) for c in categories],
            "n_neighbors": params.n_neighbors,
            "per_category_stats": results,
        }

        return {
            "method": "Local Join Count Statistics (Anselin & Li 2019)",
            "n_categories": n_categories,
            "categories": [str(c) for c in categories],
            "per_category_stats": results,
            "interpretation": (
                "Local Join Count statistics identify spatial clusters for each category. "
                "High LJC values indicate locations where category members cluster together. "
                "P-values < 0.05 indicate statistically significant local clustering. "
                "Results stored in adata.obs as 'ljc_{category}' and 'ljc_{category}_pvalue'."
            ),
        }

    except Exception as e:
        raise ProcessingError(f"Local Join Count analysis failed: {e}") from e

def _analyze_network_properties(
    adata: ad.AnnData,
    cluster_key: str,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Analyze network properties of the spatial graph.

    Migrated from spatial_statistics.py
    """
    # Check for required dependencies
    if not is_available("networkx"):
        raise DependencyError(
            "networkx package required. Install with: pip install networkx"
        )

    try:
        import networkx as nx
        from scipy.sparse import csr_matrix  # noqa: F401

        # Get or create spatial connectivity
        if "spatial_connectivities" in adata.obsp:
            conn_matrix = adata.obsp["spatial_connectivities"]
        else:
            # Create connectivity matrix
            from sklearn.neighbors import kneighbors_graph

            coords = require_spatial_coords(adata)
            conn_matrix = kneighbors_graph(
                coords, n_neighbors=params.n_neighbors, mode="connectivity"
            )

        # Convert to networkx graph
        G = nx.from_scipy_sparse_array(conn_matrix)

        # Compute properties
        properties = {
            "n_nodes": G.number_of_nodes(),
            "n_edges": G.number_of_edges(),
            "density": float(nx.density(G)),
            "is_connected": nx.is_connected(G),
            "n_components": nx.number_connected_components(G),
        }

        # Additional metrics for connected graphs
        if properties["is_connected"]:
            properties["diameter"] = nx.diameter(G)
            properties["radius"] = nx.radius(G)
        else:
            # Analyze largest component
            largest_cc = max(nx.connected_components(G), key=len)
            G.subgraph(largest_cc)
            properties["largest_component_size"] = len(largest_cc)
            properties["largest_component_fraction"] = (
                len(largest_cc) / G.number_of_nodes()
            )

        # Clustering coefficient
        try:
            properties["avg_clustering"] = float(nx.average_clustering(G))
        except Exception:
            properties["avg_clustering"] = None

        # Degree statistics
        degrees = dict(G.degree())
        degree_values = list(degrees.values())
        properties["degree_mean"] = float(np.mean(degree_values))
        properties["degree_std"] = float(np.std(degree_values))

        return properties

    except Exception as e:
        raise ProcessingError(f"Network properties analysis failed: {e}") from e

async def _analyze_spatial_centrality(
    adata: ad.AnnData,
    cluster_key: str,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
    """
    Compute various centrality measures for the spatial network.

    Migrated from spatial_statistics.py
    """
    # Check for required dependencies
    if not is_available("networkx"):
        raise DependencyError(
            "NetworkX required for centrality analysis. Install with: pip install networkx"
        )

    try:
        import networkx as nx

        # Get connectivity matrix
        if "spatial_connectivities" in adata.obsp:
            conn_matrix = adata.obsp["spatial_connectivities"]
        else:
            from sklearn.neighbors import kneighbors_graph

            coords = require_spatial_coords(adata)
            conn_matrix = kneighbors_graph(
                coords, n_neighbors=params.n_neighbors, mode="connectivity"
            )

        # Convert to networkx
        G = nx.from_scipy_sparse_array(conn_matrix)

        # Compute centrality measures (returns dict with integer keys)
        degree_centrality = nx.degree_centrality(G)
        closeness_centrality = nx.closeness_centrality(G)
        betweenness_centrality = nx.betweenness_centrality(G)

        # FIX: NetworkX returns {0: val0, 1: val1, ...} with integer keys,
        # but adata.obs_names are strings. We need to extract values in order.
        # Bug: pd.Series(dict) cannot align integer keys to string obs_names
        n_nodes = adata.n_obs

        # Validate that all expected node keys exist in centrality results
        # This catches edge cases like disconnected graphs or isolated nodes
        expected_keys = set(range(n_nodes))
        missing_degree = expected_keys - set(degree_centrality.keys())
        missing_closeness = expected_keys - set(closeness_centrality.keys())
        missing_betweenness = expected_keys - set(betweenness_centrality.keys())

        if missing_degree or missing_closeness or missing_betweenness:
            await ctx.warning(
                f"Centrality computation incomplete: "
                f"missing degree={len(missing_degree)}, "
                f"closeness={len(missing_closeness)}, "
                f"betweenness={len(missing_betweenness)} nodes. "
                f"Graph may have disconnected components."
            )

        # Use .get() with default 0.0 for missing nodes (isolated/disconnected)
        degree_vals = np.array([degree_centrality.get(i, 0.0) for i in range(n_nodes)])
        closeness_vals = np.array(
            [closeness_centrality.get(i, 0.0) for i in range(n_nodes)]
        )
        betweenness_vals = np.array(
            [betweenness_centrality.get(i, 0.0) for i in range(n_nodes)]
        )

        # Store in adata.obs (directly as numpy array)
        adata.obs["degree_centrality"] = degree_vals
        adata.obs["closeness_centrality"] = closeness_vals
        adata.obs["betweenness_centrality"] = betweenness_vals

        # Compute statistics by cluster
        centrality_stats = {}
        for cluster in adata.obs[cluster_key].unique():
            mask = adata.obs[cluster_key] == cluster
            centrality_stats[str(cluster)] = {
                "mean_degree": float(adata.obs.loc[mask, "degree_centrality"].mean()),
                "mean_closeness": float(
                    adata.obs.loc[mask, "closeness_centrality"].mean()
                ),
                "mean_betweenness": float(
                    adata.obs.loc[mask, "betweenness_centrality"].mean()
                ),
            }

        return {
            "centrality_computed": True,
            "cluster_centrality": centrality_stats,
            "global_stats": {
                "mean_degree": float(np.mean(list(degree_centrality.values()))),
                "mean_closeness": float(np.mean(list(closeness_centrality.values()))),
                "mean_betweenness": float(
                    np.mean(list(betweenness_centrality.values()))
                ),
            },
        }

    except Exception as e:
        raise ProcessingError(f"Spatial centrality analysis failed: {e}") from e

async def _analyze_local_moran(
    adata: ad.AnnData,
    params: SpatialStatisticsParameters,
    ctx: "ToolContext",
) -> dict[str, Any]:
"""
|
|
1276
|
+
Calculate Local Moran's I (LISA) for spatial clustering detection.
|
|
1277
|
+
|
|
1278
|
+
Local Moran's I identifies spatial clusters and outliers by measuring
|
|
1279
|
+
the local spatial autocorrelation for each observation.
|
|
1280
|
+
|
|
1281
|
+
Parameters
|
|
1282
|
+
----------
|
|
1283
|
+
adata : ad.AnnData
|
|
1284
|
+
Annotated data object
|
|
1285
|
+
params : SpatialStatisticsParameters
|
|
1286
|
+
Analysis parameters including genes to analyze
|
|
1287
|
+
ctx : ToolContext
|
|
1288
|
+
ToolContext for logging and data access
|
|
1289
|
+
|
|
1290
|
+
Returns
|
|
1291
|
+
-------
|
|
1292
|
+
Dict[str, Any]
|
|
1293
|
+
Results including Local Moran's I values and statistics for each gene
|
|
1294
|
+
|
|
1295
|
+
Notes
|
|
1296
|
+
-----
|
|
1297
|
+
This implementation uses PySAL's esda.Moran_Local with permutation-based
|
|
1298
|
+
significance testing, following best practices from:
|
|
1299
|
+
- GeoDa Center: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html
|
|
1300
|
+
- PySAL documentation: https://pysal.org/esda/generated/esda.Moran_Local.html
|
|
1301
|
+
|
|
1302
|
+
The permutation approach holds each observation fixed while randomly permuting
|
|
1303
|
+
the remaining n-1 values to generate a reference distribution for significance
|
|
1304
|
+
testing. This is more robust than parametric approaches as it makes fewer
|
|
1305
|
+
distributional assumptions.
|
|
1306
|
+
|
|
1307
|
+
Quadrant classification (LISA clusters):
|
|
1308
|
+
- HH (High-High): Hot spots - high values surrounded by high values
|
|
1309
|
+
- LL (Low-Low): Cold spots - low values surrounded by low values
|
|
1310
|
+
- HL (High-Low): High outliers - high values surrounded by low values
|
|
1311
|
+
- LH (Low-High): Low outliers - low values surrounded by high values
|
|
1312
|
+
"""
|
|
1313
|
+
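    # Minimal illustrative sketch of the PySAL call used below (toy lattice data,
    # hypothetical values; not executed here):
    #   import numpy as np
    #   from libpysal.weights import lat2W
    #   from esda.moran import Moran_Local
    #   y = np.random.default_rng(0).poisson(5, size=25).astype(np.float64)
    #   lisa = Moran_Local(y, lat2W(5, 5), permutations=999)
    #   lisa.Is, lisa.p_sim, lisa.q  # local I values, permutation p-values, quadrants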
    # Import PySAL components for proper LISA analysis
    require("esda")  # Raises ImportError with install instructions if missing
    require("libpysal")  # Raises ImportError with install instructions if missing
    from esda.moran import Moran_Local
    from libpysal.weights import W as PySALWeights

    try:
        # Ensure spatial neighbors exist
        await ensure_spatial_neighbors_async(adata, ctx, n_neighs=params.n_neighbors)

        # Unified gene selection (default 5 genes for computational efficiency)
        n_genes = (
            min(5, params.n_top_genes) if params.genes is None else params.n_top_genes
        )
        valid_genes = select_genes_for_analysis(
            adata,
            genes=params.genes,
            n_genes=n_genes,
            analysis_name="Local Moran's I (LISA)",
        )

        # Convert spatial connectivity matrix to PySAL weights format
        W_sparse = adata.obsp["spatial_connectivities"]

        # Create PySAL weights from sparse matrix using optimized CSR access
        # Direct CSR array access avoids per-row object creation (15x faster)
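        # For a CSR matrix, row i's neighbor indices are indices[indptr[i]:indptr[i+1]]
        # and the matching edge weights are data[indptr[i]:indptr[i+1]]; PySAL's W takes
        # exactly this pair of {node: [neighbors]} and {node: [weights]} dicts.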
        from scipy.sparse import csr_matrix

        if not isinstance(W_sparse, csr_matrix):
            W_sparse = csr_matrix(W_sparse)

        neighbors_dict = {}
        weights_dict = {}
        n_obs = W_sparse.shape[0]

        # Direct access to CSR internal arrays
        indptr = W_sparse.indptr
        indices = W_sparse.indices
        data = W_sparse.data

        for i in range(n_obs):
            start, end = indptr[i], indptr[i + 1]
            neighbors_dict[i] = indices[start:end].tolist()
            weights_dict[i] = data[start:end].tolist()

        w = PySALWeights(neighbors_dict, weights_dict)

        # Get analysis parameters
        permutations = params.local_moran_permutations
        alpha = params.local_moran_alpha
        use_fdr = params.local_moran_fdr_correction

        # Memory-efficient streaming: extract one gene at a time
        # This reduces memory from O(n_spots × n_genes) to O(n_spots)
        # Critical for large datasets (Visium HD: 50K+ spots × 500 genes = 200MB+)
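        # (in float64, 50,000 spots × 500 genes × 8 bytes ≈ 200 MB if densified at once,
        #  versus 50,000 × 8 bytes = 400 KB for a single streamed gene column)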
        results = {}
        for gene in valid_genes:
            # Extract single gene column - memory efficient for sparse matrices
            gene_idx = adata.var_names.get_loc(gene)
            expr = to_dense(adata.X[:, gene_idx]).flatten()

            # CRITICAL: Convert to float64 for PySAL/numba compatibility
            # PySAL's Moran_Local uses numba JIT compilation which requires
            # consistent dtypes (float64) for matrix operations
            expr = expr.astype(np.float64, copy=False)

            # Run PySAL Local Moran's I with permutation testing
            lisa = Moran_Local(expr, w, permutations=permutations)

            # Store local I values in adata.obs
            adata.obs[f"{gene}_local_morans"] = lisa.Is

            # Get p-values from permutation test
            p_values = lisa.p_sim
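            # Note: permutation pseudo p-values have granularity 1 / (permutations + 1),
            # e.g. 999 permutations cannot produce p-values below 0.001.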

            # Apply FDR correction if requested
            if use_fdr and permutations > 0:
                # Check statsmodels availability for FDR correction
                require(
                    "statsmodels"
                )  # Raises ImportError with install instructions if missing
                from statsmodels.stats.multitest import multipletests

                _, p_corrected, _, _ = multipletests(
                    p_values, alpha=alpha, method="fdr_bh"
                )
                significant = p_corrected < alpha
            else:
                significant = p_values < alpha

            # Classify by quadrant AND significance
            # PySAL quadrant codes: 1=HH, 2=LH, 3=LL, 4=HL
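            # For instance, a spot whose (mean-centered) expression is positive and whose
            # neighbors' average is also positive falls in quadrant 1 (HH); it is only
            # labelled a hot spot below if its permutation p-value passes the threshold.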
            q = lisa.q

            # Hot spots: High-High clusters (significant positive spatial autocorrelation)
            hotspots = np.where((q == 1) & significant)[0].tolist()
            # Cold spots: Low-Low clusters (significant positive spatial autocorrelation)
            coldspots = np.where((q == 3) & significant)[0].tolist()
            # High outliers: High values surrounded by low values
            high_outliers = np.where((q == 4) & significant)[0].tolist()
            # Low outliers: Low values surrounded by high values
            low_outliers = np.where((q == 2) & significant)[0].tolist()

            # Store quadrant classification in adata.obs (object dtype so the longer
            # outlier labels are not silently truncated to a fixed-width string)
            quadrant_labels = np.full(n_obs, "Not Significant", dtype=object)
            quadrant_labels[(q == 1) & significant] = "HH (Hot Spot)"
            quadrant_labels[(q == 3) & significant] = "LL (Cold Spot)"
            quadrant_labels[(q == 4) & significant] = "HL (High Outlier)"
            quadrant_labels[(q == 2) & significant] = "LH (Low Outlier)"
            adata.obs[f"{gene}_lisa_cluster"] = pd.Categorical(quadrant_labels)

            # Store p-values
            adata.obs[f"{gene}_lisa_pvalue"] = p_values

            results[gene] = {
                "mean_I": float(np.mean(lisa.Is)),
                "std_I": float(np.std(lisa.Is)),
                "min_I": float(np.min(lisa.Is)),
                "max_I": float(np.max(lisa.Is)),
                "n_significant": int(np.sum(significant)),
                "n_hotspots": len(hotspots),  # HH clusters
                "n_coldspots": len(coldspots),  # LL clusters
                "n_high_outliers": len(high_outliers),  # HL
                "n_low_outliers": len(low_outliers),  # LH
                "permutations": permutations,
                "alpha": alpha,
                "fdr_corrected": use_fdr,
            }

        # Store summary in uns
        adata.uns["local_moran"] = {
            "genes_analyzed": valid_genes,
            "n_neighbors": params.n_neighbors,
            "permutations": permutations,
            "alpha": alpha,
            "fdr_corrected": use_fdr,
            "results": results,
            "method": "PySAL esda.Moran_Local",
            "reference": "Anselin, L. (1995). Local Indicators of Spatial Association - LISA",
        }

        return {
            "analysis_type": "local_moran",
            "genes_analyzed": valid_genes,
            "results": results,
            "parameters": {
                "permutations": permutations,
                "alpha": alpha,
                "fdr_corrected": use_fdr,
                "n_neighbors": params.n_neighbors,
            },
            "interpretation": (
                "LISA (Local Indicators of Spatial Association) identifies statistically "
                "significant spatial clusters and outliers using permutation-based testing. "
                "HH (Hot Spots): high values clustered together. "
                "LL (Cold Spots): low values clustered together. "
                "HL/LH (Outliers): values significantly different from neighbors. "
                f"Significance determined by {permutations} permutations "
                f"with alpha={alpha}{' and FDR correction' if use_fdr else ''}."
            ),
        }

    except Exception as e:
        raise ProcessingError(f"Local Moran's I analysis failed: {e}") from e