chatspatial-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1476 @@
1
+ """
2
+ Quantitative spatial statistics for spatial transcriptomics data.
3
+
4
+ This module provides a collection of functions to compute various spatial
5
+ statistics. It includes methods for assessing global and local spatial
6
+ autocorrelation, analyzing neighborhood compositions, and evaluating spatial
7
+ patterns of cell clusters.
8
+
9
+ Key functionalities include:
10
+ - Global spatial autocorrelation (Moran's I, Geary's C).
11
+ - Local spatial autocorrelation (Local Moran's I / LISA for cluster detection).
12
+ - Local spatial statistics for hotspot detection (Getis-Ord Gi*).
13
+ - Cluster-based analysis (Neighborhood Enrichment, Co-occurrence, Ripley's K).
14
+ - Spatial network analysis (Centrality Scores, Network Properties).
15
+ - Bivariate spatial correlation analysis (Bivariate Moran's I).
16
+ - Categorical spatial analysis (Join Count statistics).
17
+ - Spatial centrality measures for tissue architecture.
18
+
19
+ The primary entry point is the `analyze_spatial_statistics` function, which
20
+ dispatches tasks to the appropriate analysis function based on user parameters.
21
+ All 13 analysis types are accessible through this unified interface, with a
22
+ unified 'genes' parameter for consistent gene selection across methods.
23
+ """
24
+
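For orientation, here is a minimal usage sketch of the entry point described above. The exact `SpatialStatisticsParameters` constructor arguments are assumed from the fields this module reads (`analysis_type`, `genes`, `n_neighbors`, ...); the gene names and data id are placeholders.

```python
# Minimal usage sketch (not part of the package); field names follow how this
# module reads `params`, gene names and data_id are placeholders.
from chatspatial.models.data import SpatialStatisticsParameters
from chatspatial.tools.spatial_statistics import analyze_spatial_statistics

async def run_moran_example(ctx, data_id: str = "my_dataset"):
    params = SpatialStatisticsParameters(
        analysis_type="moran",      # one of the 13 supported types listed below
        genes=["GeneA", "GeneB"],   # optional; defaults to highly variable genes
        n_neighbors=6,
    )
    result = await analyze_spatial_statistics(data_id, ctx, params)
    # Compact summary fields returned to the MCP client
    print(result.n_features_analyzed, result.n_significant, result.top_features)
```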
25
+ from __future__ import annotations
26
+
27
+ from typing import TYPE_CHECKING, Any, Optional
28
+
29
+ import anndata as ad
30
+ import numpy as np
31
+ import pandas as pd
32
+ import squidpy as sq
33
+
34
+ from ..utils.dependency_manager import is_available, require
35
+
36
+ if TYPE_CHECKING:
37
+ from ..spatial_mcp_adapter import ToolContext
38
+
39
+ from ..models.analysis import SpatialStatisticsResult
40
+ from ..models.data import SpatialStatisticsParameters
41
+ from ..utils.adata_utils import (
42
+ ensure_categorical,
43
+ require_spatial_coords,
44
+ select_genes_for_analysis,
45
+ to_dense,
46
+ validate_adata_basics,
47
+ )
48
+ from ..utils.compute import ensure_spatial_neighbors_async
49
+ from ..utils.exceptions import (
50
+ DataCompatibilityError,
51
+ DataNotFoundError,
52
+ DependencyError,
53
+ ParameterError,
54
+ ProcessingError,
55
+ )
56
+
57
+ # ============================================================================
58
+ # MAIN ENTRY POINT
59
+ # ============================================================================
60
+
61
+
62
+ async def analyze_spatial_statistics(
63
+ data_id: str,
64
+ ctx: ToolContext,
65
+ params: SpatialStatisticsParameters, # No default - must be provided by caller (LLM)
66
+ ) -> SpatialStatisticsResult:
67
+ """
68
+ Serves as the central dispatcher for executing various spatial analysis methods.
69
+
70
+ This function validates the input data, computes a spatial neighbor graph if one
71
+ does not exist, and routes the analysis to the appropriate specialized function
72
+ based on the `analysis_type` parameter. Results from the analysis are added to
73
+ the `AnnData` object within the data store. Note that visualization is handled
74
+ by a separate function.
75
+
76
+ Parameters
77
+ ----------
78
+ data_id : str
79
+ The identifier for the dataset.
80
+ ctx : ToolContext
81
+ Tool context for data access and logging.
82
+ params : SpatialStatisticsParameters
83
+ An object containing the parameters for the analysis, including the
84
+ specific `analysis_type` to perform.
85
+
86
+ Returns
87
+ -------
88
+ SpatialStatisticsResult
89
+ An object containing the statistical results and metadata from the analysis.
90
+
91
+ Raises
92
+ ------
93
+ DataNotFoundError
94
+ If the specified dataset is not found in the data store.
95
+ ParameterError
96
+ If the provided parameters are not valid for the requested analysis.
97
+ ProcessingError
98
+ If an error occurs during the execution of the analysis.
99
+ """
100
+ # Validate parameters
101
+ supported_types = [
102
+ "neighborhood",
103
+ "co_occurrence",
104
+ "ripley",
105
+ "moran",
106
+ "local_moran", # Added Local Moran's I
107
+ "geary",
108
+ "centrality",
109
+ "getis_ord",
110
+ "bivariate_moran",
111
+ "join_count", # Traditional Join Count for binary data (2 categories)
112
+ "local_join_count", # Local Join Count for multi-category data (>2 categories)
113
+ "network_properties",
114
+ "spatial_centrality",
115
+ ]
116
+
117
+ if params.analysis_type not in supported_types:
118
+ raise ParameterError(f"Unsupported analysis type: {params.analysis_type}")
119
+
120
+ if params.n_neighbors <= 0:
121
+ raise ParameterError(f"n_neighbors must be positive, got {params.n_neighbors}")
122
+
123
+ # Retrieve dataset via ToolContext
124
+ try:
125
+ adata = await ctx.get_adata(data_id)
126
+
127
+ # Basic validation: min 10 cells, spatial coordinates exist
128
+ validate_adata_basics(adata, min_obs=10)
129
+ require_spatial_coords(adata) # Validates spatial coords exist
130
+
131
+ # Determine if cluster_key is required for this analysis type
132
+ analyses_requiring_cluster_key = {
133
+ "neighborhood",
134
+ "co_occurrence",
135
+ "ripley",
136
+ "join_count",
137
+ "local_join_count",
138
+ "centrality",
139
+ "network_properties",
140
+ "spatial_centrality",
141
+ }
142
+
143
+ # Ensure cluster key only for analyses that require it
144
+ cluster_key = None
145
+ if params.analysis_type in analyses_requiring_cluster_key:
146
+ if params.cluster_key not in adata.obs.columns:
147
+ available = [
148
+ c
149
+ for c in adata.obs.columns
150
+ if "cluster" in c.lower() or c in ["leiden", "louvain"]
151
+ ]
152
+ raise DataNotFoundError(
153
+ f"Cluster key '{params.cluster_key}' not found. "
154
+ f"Available: {available if available else 'None'}. "
155
+ f"Run preprocess_data() first to generate clusters."
156
+ )
157
+ ensure_categorical(adata, params.cluster_key)
158
+ cluster_key = params.cluster_key
159
+
160
+ # Ensure spatial neighbors
161
+ await ensure_spatial_neighbors_async(adata, ctx, n_neighs=params.n_neighbors)
162
+
163
+ # Route to appropriate analysis function
164
+ if params.analysis_type == "moran":
165
+ result = _analyze_morans_i(adata, params, ctx)
166
+ elif params.analysis_type == "local_moran":
167
+ result = await _analyze_local_moran(adata, params, ctx)
168
+ elif params.analysis_type == "geary":
169
+ result = _analyze_gearys_c(adata, params, ctx)
170
+ elif params.analysis_type == "neighborhood":
171
+ result = _analyze_neighborhood_enrichment(adata, cluster_key, ctx)
172
+ elif params.analysis_type == "co_occurrence":
173
+ result = _analyze_co_occurrence(adata, cluster_key, ctx)
174
+ elif params.analysis_type == "ripley":
175
+ result = _analyze_ripleys_k(adata, cluster_key, ctx)
176
+ elif params.analysis_type == "getis_ord":
177
+ result = _analyze_getis_ord(adata, params, ctx)
178
+ elif params.analysis_type == "centrality":
179
+ result = _analyze_centrality(adata, cluster_key, ctx)
180
+ elif params.analysis_type == "bivariate_moran":
181
+ result = _analyze_bivariate_moran(adata, params, ctx)
182
+ elif params.analysis_type == "join_count":
183
+ result = _analyze_join_count(adata, cluster_key, params, ctx)
184
+ elif params.analysis_type == "local_join_count":
185
+ result = _analyze_local_join_count(adata, cluster_key, params, ctx)
186
+ elif params.analysis_type == "network_properties":
187
+ result = _analyze_network_properties(adata, cluster_key, params, ctx)
188
+ elif params.analysis_type == "spatial_centrality":
189
+ result = await _analyze_spatial_centrality(adata, cluster_key, params, ctx)
190
+ else:
191
+ raise ParameterError(
192
+ f"Analysis type {params.analysis_type} not implemented"
193
+ )
194
+
195
+ # COW FIX: No need to update data_store - changes already reflected via direct reference
196
+ # All modifications to adata.obs/uns/obsp are in-place and preserved
197
+
198
+ # Ensure result is a dictionary
199
+ if not isinstance(result, dict):
200
+ if hasattr(result, "dict"):
201
+ result = result.dict()
202
+ else:
203
+ raise ProcessingError("Invalid result format from analysis function")
204
+
205
+ # Add metadata
206
+ result.update(
207
+ {
208
+ "n_cells": adata.n_obs,
209
+ "n_neighbors": params.n_neighbors,
210
+ }
211
+ )
212
+
213
+ # Store scientific metadata for reproducibility
214
+ from ..utils.adata_utils import store_analysis_metadata
215
+
216
+ # Determine results keys based on analysis type
217
+ results_keys_dict = {"obs": [], "var": [], "obsm": [], "uns": []}
218
+ if params.analysis_type in ["moran", "geary"]:
219
+ results_keys_dict["uns"].append(f"{params.analysis_type}s_i")
220
+ elif params.analysis_type == "local_moran":
221
+ results_keys_dict["obs"].extend(
222
+ [f"{gene}_local_moran" for gene in (params.genes or [])]
223
+ )
224
+ elif params.analysis_type == "getis_ord":
225
+ if params.genes:
226
+ for gene in params.genes:
227
+ results_keys_dict["obs"].extend(
228
+ [f"{gene}_getis_ord_z", f"{gene}_getis_ord_p"]
229
+ )
230
+ elif params.analysis_type in ["neighborhood", "co_occurrence"]:
231
+ results_keys_dict["uns"].append(params.analysis_type)
232
+ elif params.analysis_type == "ripley":
233
+ results_keys_dict["uns"].append("ripley")
234
+ elif params.analysis_type == "centrality":
235
+ results_keys_dict["uns"].append("centrality_scores")
236
+
237
+ # Prepare parameters dict
238
+ parameters_dict = {
239
+ "n_neighbors": params.n_neighbors,
240
+ }
241
+ if cluster_key:
242
+ parameters_dict["cluster_key"] = cluster_key
243
+ if params.genes:
244
+ parameters_dict["genes"] = params.genes
245
+ # Add n_perms based on analysis type
246
+ if params.analysis_type in ["moran", "local_moran", "geary"]:
247
+ parameters_dict["n_perms"] = params.moran_n_perms
248
+
249
+ # Extract statistics for metadata
250
+ statistics_dict = {
251
+ "n_cells": adata.n_obs,
252
+ }
253
+ if "n_significant" in result:
254
+ statistics_dict["n_significant"] = result["n_significant"]
255
+ if "mean_score" in result:
256
+ statistics_dict["mean_score"] = result["mean_score"]
257
+
258
+ # Store metadata
259
+ store_analysis_metadata(
260
+ adata,
261
+ analysis_name=f"spatial_stats_{params.analysis_type}",
262
+ method=params.analysis_type,
263
+ parameters=parameters_dict,
264
+ results_keys=results_keys_dict,
265
+ statistics=statistics_dict,
266
+ )
267
+
268
+ # Extract summary fields for MCP response (detailed statistics excluded)
269
+ summary = _extract_result_summary(result, params.analysis_type)
270
+
271
+ return SpatialStatisticsResult(
272
+ data_id=data_id,
273
+ analysis_type=params.analysis_type,
274
+ n_features_analyzed=summary["n_features_analyzed"],
275
+ n_significant=summary["n_significant"],
276
+ top_features=summary["top_features"],
277
+ summary_metrics=summary["summary_metrics"],
278
+ results_key=summary.get("results_key"),
279
+ statistics=result, # Excluded from MCP response via Field(exclude=True)
280
+ )
281
+
282
+ except (DataNotFoundError, ParameterError, DataCompatibilityError):
283
+ raise
284
+ except Exception as e:
285
+ raise ProcessingError(
286
+ f"Error in {params.analysis_type} analysis: {e}"
287
+ ) from e
288
+
289
+
290
+ # ============================================================================
291
+ # HELPER FUNCTIONS
292
+ # ============================================================================
293
+
294
+
295
+ def _extract_result_summary(
296
+ result: dict[str, Any], analysis_type: str
297
+ ) -> dict[str, Any]:
298
+ """Extract compact summary from analysis result for MCP response.
299
+
300
+ This function extracts the most informative fields from detailed analysis results,
301
+ keeping the MCP response small while preserving actionable insights for the LLM.
302
+
303
+ Args:
304
+ result: Full result dictionary from analysis function
305
+ analysis_type: Type of spatial analysis performed
306
+
307
+ Returns:
308
+ Dictionary with standardized summary fields:
309
+ - n_features_analyzed: Number of genes/clusters analyzed
310
+ - n_significant: Number of significant results
311
+ - top_features: List of top significant features (max 10)
312
+ - summary_metrics: Key numeric metrics
313
+ - results_key: Key in adata.uns for full results (if applicable)
314
+ """
315
+ summary: dict[str, Any] = {
316
+ "n_features_analyzed": 0,
317
+ "n_significant": 0,
318
+ "top_features": [],
319
+ "summary_metrics": {},
320
+ "results_key": None,
321
+ }
322
+
323
+ # Extract based on analysis type
324
+ if analysis_type == "moran":
325
+ summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
326
+ summary["n_significant"] = result.get("n_significant", 0)
327
+ summary["top_features"] = result.get("top_highest_autocorrelation", [])[:10]
328
+ summary["summary_metrics"] = {"mean_morans_i": result.get("mean_morans_i", 0.0)}
329
+ summary["results_key"] = result.get("analysis_key")
330
+
331
+ elif analysis_type == "geary":
332
+ summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
333
+ summary["summary_metrics"] = {"mean_gearys_c": result.get("mean_gearys_c", 0.0)}
334
+ summary["results_key"] = result.get("analysis_key")
335
+
336
+ elif analysis_type == "local_moran":
337
+ summary["n_features_analyzed"] = result.get("n_genes_analyzed", 0)
338
+ summary["n_significant"] = result.get("n_significant_total", 0)
339
+ summary["top_features"] = result.get("top_clustered_genes", [])[:10]
340
+ summary["summary_metrics"] = {
341
+ "mean_hotspots": result.get("mean_hotspots_per_gene", 0.0),
342
+ "mean_coldspots": result.get("mean_coldspots_per_gene", 0.0),
343
+ }
344
+
345
+ elif analysis_type == "getis_ord":
346
+ genes_analyzed = result.get("genes_analyzed", [])
347
+ summary["n_features_analyzed"] = len(genes_analyzed)
348
+ summary["top_features"] = genes_analyzed[:10]
349
+ # Count total hotspots across all genes
350
+ per_gene_results = result.get("results", {})
351
+ total_hot = sum(r.get("n_hot_spots", 0) for r in per_gene_results.values())
352
+ total_cold = sum(r.get("n_cold_spots", 0) for r in per_gene_results.values())
353
+ summary["summary_metrics"] = {
354
+ "total_hotspots": total_hot,
355
+ "total_coldspots": total_cold,
356
+ }
357
+
358
+ elif analysis_type == "neighborhood":
359
+ summary["n_features_analyzed"] = result.get("n_clusters", 0)
360
+ summary["summary_metrics"] = {
361
+ "max_enrichment": result.get("max_enrichment", 0.0),
362
+ "min_enrichment": result.get("min_enrichment", 0.0),
363
+ }
364
+ summary["results_key"] = result.get("analysis_key")
365
+
366
+ elif analysis_type == "co_occurrence":
367
+ summary["n_features_analyzed"] = result.get("n_clusters", 0)
368
+ summary["results_key"] = result.get("analysis_key")
369
+
370
+ elif analysis_type == "ripley":
371
+ summary["n_features_analyzed"] = result.get("n_clusters", 0)
372
+ summary["results_key"] = result.get("analysis_key")
373
+
374
+ elif analysis_type == "centrality":
375
+ summary["n_features_analyzed"] = result.get("n_clusters", 0)
376
+ summary["results_key"] = result.get("analysis_key")
377
+
378
+ elif analysis_type == "bivariate_moran":
379
+ # Keys produced by _analyze_bivariate_moran: {"g1_vs_g2": I} under "bivariate_morans_i"
+ per_pair = result.get("bivariate_morans_i", {})
+ summary["n_features_analyzed"] = result.get("n_pairs_analyzed", len(per_pair))
+ summary["top_features"] = list(per_pair)[:10]
+ # Pairs with notable spatial correlation (|I| > 0.3)
+ significant = [k for k, v in per_pair.items() if abs(v) > 0.3]
+ summary["n_significant"] = len(significant)
+ summary["summary_metrics"] = {
+ "mean_bivariate_i": result.get("mean_bivariate_i", 0.0)
+ }
386
+
387
+ elif analysis_type in ["join_count", "local_join_count"]:
388
+ summary["n_features_analyzed"] = result.get("n_categories", 0)
389
+ summary["n_significant"] = result.get("n_significant", 0)
390
+ summary["results_key"] = result.get("analysis_key")
391
+
392
+ elif analysis_type in ["network_properties", "spatial_centrality"]:
393
+ summary["results_key"] = result.get("analysis_key")
394
+ summary["summary_metrics"] = {
395
+ k: v
396
+ for k, v in result.items()
397
+ if isinstance(v, (int, float)) and k not in ("n_cells", "n_neighbors")
398
+ }
399
+
400
+ return summary
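A hypothetical input/output pair for this helper (values invented for illustration), exercising the `moran` branch above:

```python
# Hypothetical example (values invented) of the compaction performed above.
moran_result = {
    "n_genes_analyzed": 50,
    "n_significant": 12,
    "top_highest_autocorrelation": ["GeneA", "GeneB", "GeneC"],
    "mean_morans_i": 0.21,
    "analysis_key": "moranI",
}
summary = _extract_result_summary(moran_result, "moran")
# -> n_features_analyzed=50, n_significant=12,
#    top_features=["GeneA", "GeneB", "GeneC"],
#    summary_metrics={"mean_morans_i": 0.21}, results_key="moranI"
```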
401
+
402
+
403
+ def _get_optimal_n_jobs(n_obs: int, requested_n_jobs: Optional[int] = None) -> int:
404
+ """Determine optimal number of parallel jobs based on data size."""
405
+ import os
406
+
407
+ if requested_n_jobs is not None:
408
+ if requested_n_jobs == -1:
409
+ return os.cpu_count() or 1
410
+ return requested_n_jobs
411
+
412
+ # Smart defaults based on data size
413
+ if n_obs < 1000:
414
+ return 1 # Single thread for small data
415
+ elif n_obs < 5000:
416
+ return min(2, os.cpu_count() or 1)
417
+ else:
418
+ return min(4, os.cpu_count() or 1)
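The heuristic above resolves roughly as follows (actual values are capped by `os.cpu_count()`):

```python
# Behaviour of the defaults above (results capped by os.cpu_count()).
_get_optimal_n_jobs(800)                          # 1: small data stays single-threaded
_get_optimal_n_jobs(3_000)                        # min(2, cpu_count)
_get_optimal_n_jobs(50_000)                       # min(4, cpu_count)
_get_optimal_n_jobs(50_000, requested_n_jobs=-1)  # cpu_count (use all cores)
```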
419
+
420
+
421
+ # ============================================================================
422
+ # CORE ANALYSIS FUNCTIONS
423
+ # ============================================================================
424
+
425
+
426
+ def _analyze_morans_i(
427
+ adata: ad.AnnData,
428
+ params: SpatialStatisticsParameters,
429
+ ctx: "ToolContext",
430
+ ) -> dict[str, Any]:
431
+ """
432
+ Calculates Moran's I to measure global spatial autocorrelation for genes.
433
+
434
+ Moran's I is a statistic that indicates whether the expression of a gene is
435
+ spatially clustered, dispersed, or randomly distributed.
436
+ - A value near +1.0 indicates strong clustering of similar expression values.
437
+ - A value near -1.0 indicates dispersion (a checkerboard-like pattern).
438
+ - A value near 0 indicates a random spatial distribution.
439
+
440
+ The analysis is performed on highly variable genes by default, but a
441
+ specific gene list can be provided.
442
+ """
443
+ # Unified gene selection
444
+ genes = select_genes_for_analysis(
445
+ adata,
446
+ genes=params.genes,
447
+ n_genes=params.n_top_genes,
448
+ analysis_name="Moran's I",
449
+ )
450
+
451
+ # Optimize parallelization
452
+ n_jobs = _get_optimal_n_jobs(adata.n_obs, params.n_jobs)
453
+
454
+ # Run spatial autocorrelation
455
+ sq.gr.spatial_autocorr(
456
+ adata,
457
+ mode="moran",
458
+ genes=genes,
459
+ n_perms=params.moran_n_perms,
460
+ two_tailed=params.moran_two_tailed,
461
+ n_jobs=n_jobs,
462
+ backend=params.backend,
463
+ show_progress_bar=False,
464
+ )
465
+
466
+ # Extract results
467
+ moran_key = "moranI"
468
+ if moran_key in adata.uns:
469
+ results_df = adata.uns[moran_key]
470
+
471
+ # Get top significant genes
472
+ significant_genes = results_df[results_df["pval_norm"] < 0.05].index.tolist()
473
+
474
+ # Calculate appropriate number of top genes to return
475
+ # To avoid returning identical lists, we take at most half of the analyzed genes
476
+ # This ensures top_highest and top_lowest are different gene sets
477
+ n_analyzed = len(results_df)
478
+ n_top = min(10, max(3, n_analyzed // 2))
479
+
480
+ # Ensure we never return more than half the genes to avoid duplicates
481
+ n_top = min(n_top, n_analyzed // 2) if n_analyzed >= 6 else 0
482
+
483
+ return {
484
+ "n_genes_analyzed": len(genes),
485
+ "n_significant": len(significant_genes),
486
+ "top_highest_autocorrelation": (
487
+ results_df.nlargest(n_top, "I").index.tolist() if n_top > 0 else []
488
+ ),
489
+ "top_lowest_autocorrelation": (
490
+ results_df.nsmallest(n_top, "I").index.tolist() if n_top > 0 else []
491
+ ),
492
+ "mean_morans_i": float(results_df["I"].mean()),
493
+ "analysis_key": moran_key,
494
+ "note": "top_highest/top_lowest refer to autocorrelation strength, not positive/negative correlation",
495
+ }
496
+
497
+ raise ProcessingError("Moran's I computation did not produce results")
498
+
499
+
500
+ def _analyze_gearys_c(
501
+ adata: ad.AnnData,
502
+ params: SpatialStatisticsParameters,
503
+ ctx: "ToolContext",
504
+ ) -> dict[str, Any]:
505
+ """Compute Geary's C spatial autocorrelation."""
506
+ # Unified gene selection
507
+ genes = select_genes_for_analysis(
508
+ adata,
509
+ genes=params.genes,
510
+ n_genes=params.n_top_genes,
511
+ analysis_name="Geary's C",
512
+ )
513
+
514
+ sq.gr.spatial_autocorr(
515
+ adata,
516
+ mode="geary",
517
+ genes=genes,
518
+ n_perms=params.moran_n_perms,
519
+ n_jobs=_get_optimal_n_jobs(adata.n_obs, params.n_jobs),
520
+ show_progress_bar=False,
521
+ )
522
+
523
+ # Extract results (squidpy returns DataFrame, not dict)
524
+ geary_key = "gearyC"
525
+ if geary_key in adata.uns:
526
+ import pandas as pd
527
+
528
+ results_df = adata.uns[geary_key]
529
+ if isinstance(results_df, pd.DataFrame):
530
+ return {
531
+ "n_genes_analyzed": len(genes),
532
+ "mean_gearys_c": float(results_df["C"].mean()),
533
+ "analysis_key": geary_key,
534
+ }
535
+
536
+ raise ProcessingError("Geary's C computation did not produce results")
537
+
538
+
539
+ def _analyze_neighborhood_enrichment(
540
+ adata: ad.AnnData,
541
+ cluster_key: str,
542
+ ctx: "ToolContext",
543
+ ) -> dict[str, Any]:
544
+ """Compute neighborhood enrichment analysis."""
545
+ sq.gr.nhood_enrichment(adata, cluster_key=cluster_key)
546
+
547
+ analysis_key = f"{cluster_key}_nhood_enrichment"
548
+ if analysis_key in adata.uns:
549
+ z_scores = adata.uns[analysis_key]["zscore"]
550
+
551
+ # Use nanmax/nanmin to handle NaN values from sparse cell type distributions
552
+ # NaN can occur when certain cell type pairs have insufficient neighborhoods
553
+ return {
554
+ "n_clusters": len(z_scores),
555
+ "max_enrichment": float(np.nanmax(z_scores)),
556
+ "min_enrichment": float(np.nanmin(z_scores)),
557
+ "analysis_key": analysis_key,
558
+ }
559
+
560
+ raise ProcessingError("Neighborhood enrichment did not produce results")
561
+
562
+
563
+ def _analyze_co_occurrence(
564
+ adata: ad.AnnData,
565
+ cluster_key: str,
566
+ ctx: "ToolContext",
567
+ ) -> dict[str, Any]:
568
+ """Compute co-occurrence analysis."""
569
+ sq.gr.co_occurrence(adata, cluster_key=cluster_key)
570
+
571
+ analysis_key = f"{cluster_key}_co_occurrence"
572
+ if analysis_key in adata.uns:
573
+ co_occurrence = adata.uns[analysis_key]["occ"]
574
+
575
+ return {"n_clusters": len(co_occurrence), "analysis_key": analysis_key}
576
+
577
+ raise ProcessingError("Co-occurrence analysis did not produce results")
578
+
579
+
580
+ def _analyze_ripleys_k(
581
+ adata: ad.AnnData,
582
+ cluster_key: str,
583
+ ctx: "ToolContext",
584
+ ) -> dict[str, Any]:
585
+ """Compute Ripley's K function."""
586
+ try:
587
+ sq.gr.ripley(
588
+ adata,
589
+ cluster_key=cluster_key,
590
+ mode="L", # L-function (variance-stabilized)
591
+ n_simulations=20,
592
+ n_observations=min(1000, adata.n_obs),
593
+ max_dist=None,
594
+ n_steps=50,
595
+ )
596
+
597
+ analysis_key = f"{cluster_key}_ripley_L"
598
+ return {"analysis_completed": True, "analysis_key": analysis_key}
599
+ except Exception as e:
600
+ raise ProcessingError(f"Ripley's K analysis failed: {e}") from e
601
+
602
+
603
+ def _analyze_getis_ord(
604
+ adata: ad.AnnData,
605
+ params: SpatialStatisticsParameters,
606
+ ctx: "ToolContext",
607
+ ) -> dict[str, Any]:
608
+ """
609
+ Performs Getis-Ord Gi* analysis to identify local spatial clusters.
610
+
611
+ This method identifies statistically significant hot spots (clusters of high
612
+ gene expression) and cold spots (clusters of low gene expression). It computes
613
+ a Z-score for each spot, where high positive Z-scores indicate hot spots and
614
+ low negative Z-scores indicate cold spots.
615
+
616
+ The significance threshold is determined by params.getis_ord_alpha, and
617
+ multiple testing correction is applied according to params.getis_ord_correction.
618
+
619
+ References
620
+ ----------
621
+ Getis, A. & Ord, J.K. (1992). The Analysis of Spatial Association by Use of
622
+ Distance Statistics. Geographical Analysis, 24(3), 189-206.
623
+
624
+ Ord, J.K. & Getis, A. (1995). Local Spatial Autocorrelation Statistics:
625
+ Distributional Issues and an Application. Geographical Analysis, 27(4), 286-306.
626
+ """
627
+ # Unified gene selection
628
+ genes = select_genes_for_analysis(
629
+ adata,
630
+ genes=params.genes,
631
+ n_genes=params.n_top_genes,
632
+ analysis_name="Getis-Ord Gi*",
633
+ )
634
+
635
+ getis_ord_results = {}
636
+
637
+ require("esda") # Raises ImportError with install instructions if missing
638
+ require("libpysal") # Raises ImportError with install instructions if missing
639
+ from esda.getisord import G_Local
640
+ from libpysal import weights
641
+ from scipy.stats import norm
642
+
643
+ try:
644
+
645
+ # Calculate Z-score threshold from alpha level (two-tailed test)
646
+ z_threshold = norm.ppf(1 - params.getis_ord_alpha / 2)
647
+
648
+ coords = require_spatial_coords(adata)
649
+ w = weights.KNN.from_array(coords, k=params.n_neighbors)
650
+ w.transform = "r"
651
+
652
+ # OPTIMIZATION: Extract all genes at once before loop (batch extraction)
653
+ # This provides 50-150x speedup by avoiding repeated AnnData slicing overhead
654
+ y_all_genes = to_dense(adata[:, genes].X)
655
+
656
+ # Collect all p-values for multiple testing correction
657
+ all_pvalues = {}
658
+
659
+ for i, gene in enumerate(genes):
660
+ # OPTIMIZATION: Direct indexing from pre-extracted dense matrix (fast!)
661
+ y = y_all_genes[:, i].astype(np.float64)
662
+
663
+ local_g = G_Local(y, w, transform="R", star=True)
664
+
665
+ # Store raw results in adata.obs
666
+ adata.obs[f"{gene}_getis_ord_z"] = local_g.Zs
667
+ adata.obs[f"{gene}_getis_ord_p"] = local_g.p_sim
668
+
669
+ # Store p-values for correction
670
+ all_pvalues[gene] = local_g.p_sim
671
+
672
+ # Count hotspots/coldspots using Z-threshold
673
+ getis_ord_results[gene] = {
674
+ "mean_z": float(np.mean(local_g.Zs)),
675
+ "std_z": float(np.std(local_g.Zs)),
676
+ "n_hot_spots": int(np.sum(local_g.Zs > z_threshold)),
677
+ "n_cold_spots": int(np.sum(local_g.Zs < -z_threshold)),
678
+ "n_significant_raw": int(
679
+ np.sum(local_g.p_sim < params.getis_ord_alpha)
680
+ ),
681
+ }
682
+
683
+ # Apply multiple testing correction if requested
684
+ if params.getis_ord_correction != "none" and len(genes) > 1:
685
+ if params.getis_ord_correction == "bonferroni":
686
+ corrected_alpha = params.getis_ord_alpha / len(genes)
687
+ corrected_z_threshold = norm.ppf(1 - corrected_alpha / 2)
688
+
689
+ for gene in genes:
690
+ p_values = all_pvalues[gene]
691
+ adata.obs[f"{gene}_getis_ord_p_corrected"] = np.minimum(
692
+ p_values * len(genes), 1.0
693
+ )
694
+
695
+ z_scores = adata.obs[f"{gene}_getis_ord_z"].values
696
+ getis_ord_results[gene]["n_hot_spots_corrected"] = int(
697
+ np.sum(z_scores > corrected_z_threshold)
698
+ )
699
+ getis_ord_results[gene]["n_cold_spots_corrected"] = int(
700
+ np.sum(z_scores < -corrected_z_threshold)
701
+ )
702
+
703
+ elif params.getis_ord_correction == "fdr_bh":
704
+ from statsmodels.stats.multitest import multipletests
705
+
706
+ for gene in genes:
707
+ p_values = all_pvalues[gene]
708
+ _, p_corrected, _, _ = multipletests(
709
+ p_values, alpha=params.getis_ord_alpha, method="fdr_bh"
710
+ )
711
+ adata.obs[f"{gene}_getis_ord_p_corrected"] = p_corrected
712
+
713
+ getis_ord_results[gene]["n_significant_corrected"] = int(
714
+ np.sum(p_corrected < params.getis_ord_alpha)
715
+ )
716
+
717
+ z_scores = adata.obs[f"{gene}_getis_ord_z"].values
718
+ significant_mask = p_corrected < params.getis_ord_alpha
719
+ getis_ord_results[gene]["n_hot_spots_corrected"] = int(
720
+ np.sum((z_scores > z_threshold) & significant_mask)
721
+ )
722
+ getis_ord_results[gene]["n_cold_spots_corrected"] = int(
723
+ np.sum((z_scores < -z_threshold) & significant_mask)
724
+ )
725
+
726
+ except Exception as e:
727
+ raise ProcessingError(f"Getis-Ord analysis failed: {e}") from e
728
+
729
+ return {
730
+ "method": "Getis-Ord Gi* (star=True)",
731
+ "n_genes_analyzed": len(getis_ord_results),
732
+ "genes_analyzed": list(getis_ord_results),
733
+ "parameters": {
734
+ "n_neighbors": params.n_neighbors,
735
+ "alpha": params.getis_ord_alpha,
736
+ "z_threshold": float(z_threshold),
737
+ "correction": params.getis_ord_correction,
738
+ },
739
+ "results": getis_ord_results,
740
+ }
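A sketch of flagging hot and cold spots from the per-spot columns written above; the gene name and alpha are placeholders:

```python
# Sketch: flag hot/cold spots for one analysed gene from the per-spot columns
# written above ("GeneA" and alpha are placeholders).
from scipy.stats import norm

gene, alpha = "GeneA", 0.05
z_threshold = norm.ppf(1 - alpha / 2)             # ~1.96 for alpha = 0.05
z = adata.obs[f"{gene}_getis_ord_z"].to_numpy()
p = adata.obs[f"{gene}_getis_ord_p"].to_numpy()
hot = (z > z_threshold) & (p < alpha)
cold = (z < -z_threshold) & (p < alpha)
print(f"{gene}: {hot.sum()} hot spots, {cold.sum()} cold spots")
```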
741
+
742
+
743
+ def _analyze_centrality(
744
+ adata: ad.AnnData,
745
+ cluster_key: str,
746
+ ctx: "ToolContext",
747
+ ) -> dict[str, Any]:
748
+ """Compute centrality scores."""
749
+ sq.gr.centrality_scores(adata, cluster_key=cluster_key)
750
+
751
+ analysis_key = f"{cluster_key}_centrality_scores"
752
+ if analysis_key in adata.uns:
753
+ scores = adata.uns[analysis_key]
754
+
755
+ return {
756
+ "analysis_completed": True,
757
+ "analysis_key": analysis_key,
758
+ "n_clusters": len(scores) if isinstance(scores, dict) else "computed",
759
+ }
760
+
761
+ raise ProcessingError("Centrality analysis did not produce results")
762
+
763
+
764
+ # ============================================================================
765
+ # ADVANCED ANALYSIS FUNCTIONS (from spatial_statistics.py)
766
+ # ============================================================================
767
+
768
+
769
+ def _analyze_bivariate_moran(
770
+ adata: ad.AnnData,
771
+ params: SpatialStatisticsParameters,
772
+ ctx: "ToolContext",
773
+ ) -> dict[str, Any]:
774
+ """
775
+ Calculates Bivariate Moran's I to assess spatial correlation between two genes.
776
+
777
+ This statistic measures how the expression of one gene in a specific location
778
+ relates to the expression of a second gene in neighboring locations. It is useful
779
+ for identifying pairs of genes that are co-localized or spatially exclusive.
780
+ A positive value suggests that high expression of gene A is surrounded by high
781
+ expression of gene B.
782
+ """
783
+ # Get gene pairs from parameters - NO ARBITRARY DEFAULTS
784
+ if not params.gene_pairs:
785
+ raise ParameterError("Bivariate Moran's I requires gene_pairs parameter.")
786
+ gene_pairs = params.gene_pairs
787
+
788
+ results = {}
789
+
790
+ # Use centralized dependency manager for consistent error handling
791
+ require("libpysal") # Raises ImportError with install instructions if missing
792
+ from libpysal.weights import KNN
793
+
794
+ try:
795
+
796
+ coords = require_spatial_coords(adata)
797
+ w = KNN.from_array(coords, k=params.n_neighbors)
798
+ w.transform = "R"
799
+
800
+ # OPTIMIZATION: Extract all unique genes involved in pairs (batch extraction)
801
+ # This provides 20-40x speedup by avoiding repeated AnnData slicing
802
+ # See test_spatial_statistics_extreme_scale.py for performance validation
803
+ all_genes_in_pairs = list(
804
+ set([g for pair in gene_pairs for g in pair if g in adata.var_names])
805
+ )
806
+
807
+ expr_all = to_dense(adata[:, all_genes_in_pairs].X)
808
+
809
+ # Create gene index mapping for fast lookup
810
+ gene_to_idx = {gene: i for i, gene in enumerate(all_genes_in_pairs)}
811
+
812
+ for gene1, gene2 in gene_pairs:
813
+ if gene1 in adata.var_names and gene2 in adata.var_names:
814
+ # OPTIMIZATION: Direct indexing from pre-extracted matrix (fast!)
815
+ idx1 = gene_to_idx[gene1]
816
+ idx2 = gene_to_idx[gene2]
817
+ x = expr_all[:, idx1].flatten()
818
+ y = expr_all[:, idx2].flatten()
819
+
820
+ # Compute bivariate Moran's I using sparse matrix operations
821
+ # Formula: I_xy = (n / S0) * (x - x̄)ᵀ W (y - ȳ) / sqrt(Var(x) * Var(y))
822
+ # Reference: Wartenberg (1985), Anselin et al. (2002)
823
+ n = len(x)
824
+ x_mean = np.mean(x)
825
+ y_mean = np.mean(y)
826
+
827
+ # Centered values
828
+ x_centered = x - x_mean
829
+ y_centered = y - y_mean
830
+
831
+ # OPTIMIZED: Use sparse matrix multiplication instead of O(n²) loop
832
+ # numerator = Σᵢ Σⱼ wᵢⱼ(xᵢ - x̄)(yⱼ - ȳ) = (x - x̄)ᵀ @ W @ (y - ȳ)
833
+ numerator = float(x_centered @ w.sparse @ y_centered)
834
+
835
+ # FIX: Bivariate Moran's I uses sqrt of product of both variances
836
+ # Not just x's variance (which was the bug)
837
+ var_x = np.sum(x_centered**2)
838
+ var_y = np.sum(y_centered**2)
839
+ denominator = np.sqrt(var_x * var_y)
840
+
841
+ if denominator > 0:
842
+ moran_i = (n / w.sparse.sum()) * (numerator / denominator)
843
+ else:
844
+ moran_i = 0.0
845
+
846
+ results[f"{gene1}_vs_{gene2}"] = float(moran_i)
847
+
848
+ except Exception as e:
849
+ raise ProcessingError(f"Bivariate Moran's I failed: {e}") from e
850
+
851
+ return {
852
+ "n_pairs_analyzed": len(results),
853
+ "bivariate_morans_i": results,
854
+ "mean_bivariate_i": float(np.mean(list(results.values()))) if results else 0,
855
+ }
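For reference, a self-contained NumPy toy version of the bivariate Moran's I formula used above, with a dense row-standardised weight matrix:

```python
# Self-contained toy version of the bivariate Moran's I formula used above.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.5, 1.0, 3.5, 4.0])
W = np.array([[0, 1, 0, 0],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
W /= W.sum(axis=1, keepdims=True)                 # row-standardise (w.transform = "R")

n, s0 = len(x), W.sum()
xc, yc = x - x.mean(), y - y.mean()
i_xy = (n / s0) * (xc @ W @ yc) / np.sqrt((xc**2).sum() * (yc**2).sum())
print(round(float(i_xy), 3))
```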
856
+
857
+
858
+ def _analyze_join_count(
859
+ adata: ad.AnnData,
860
+ cluster_key: str,
861
+ params: SpatialStatisticsParameters,
862
+ ctx: "ToolContext",
863
+ ) -> dict[str, Any]:
864
+ """
865
+ Compute traditional Join Count statistics for BINARY categorical spatial data.
866
+
867
+ IMPORTANT: This method only works for binary data (exactly 2 categories).
868
+ For multi-category data (>2 categories), use 'local_join_count' instead.
869
+
870
+ Join Count statistics (Cliff & Ord 1981) measure spatial autocorrelation in
871
+ binary categorical data by counting the number of joins between neighboring
872
+ spatial units of the same or different categories.
873
+
874
+ Returns three types of joins:
875
+ - BB (Black-Black): Both neighbors are category 1
876
+ - WW (White-White): Both neighbors are category 0
877
+ - BW (Black-White): Neighbors are different categories
878
+
879
+ Parameters
880
+ ----------
881
+ adata : AnnData
882
+ Annotated data object with spatial coordinates in .obsm['spatial']
883
+ cluster_key : str
884
+ Column in adata.obs containing the categorical variable (must have exactly 2 categories)
885
+ params : SpatialStatisticsParameters
886
+ Analysis parameters including n_neighbors
887
+ ctx : ToolContext
888
+ ToolContext for logging and data access
889
+
890
+ Returns
891
+ -------
892
+ Dict[str, Any]
893
+ Dictionary containing:
894
+ - bb: Number of Black-Black joins
895
+ - ww: Number of White-White joins
896
+ - bw: Number of Black-White joins
897
+ - J: Total number of joins
898
+ - p_value: Significance level from permutation test
899
+
900
+ References
901
+ ----------
902
+ Cliff, A.D. & Ord, J.K. (1981). Spatial Processes. Pion, London.
903
+
904
+ See Also
905
+ --------
906
+ _analyze_local_join_count : For multi-category data (>2 categories)
907
+ """
908
+ # Check for required dependencies
909
+ if not is_available("esda") or not is_available("libpysal"):
910
+ raise DependencyError(
911
+ "esda or libpysal package not installed. Install with: pip install esda libpysal"
912
+ )
913
+
914
+ try:
915
+ from esda.join_counts import Join_Counts
916
+ from libpysal.weights import KNN
917
+
918
+ coords = require_spatial_coords(adata)
919
+ w = KNN.from_array(coords, k=params.n_neighbors)
920
+
921
+ # Get categorical data
922
+ y = adata.obs[cluster_key].cat.codes.values
923
+
924
+ # Compute join counts
925
+ jc = Join_Counts(y, w)
926
+
927
+ return {
928
+ "bb": float(jc.bb), # Black-Black joins
929
+ "ww": float(jc.ww), # White-White joins
930
+ "bw": float(jc.bw), # Black-White joins
931
+ "J": float(jc.J), # Total joins
932
+ "p_value": float(jc.p_sim) if hasattr(jc, "p_sim") else None,
933
+ }
934
+
935
+ except Exception as e:
936
+ raise ProcessingError(f"Join Count analysis failed: {e}") from e
937
+
938
+
939
+ def _analyze_local_join_count(
940
+ adata: ad.AnnData,
941
+ cluster_key: str,
942
+ params: SpatialStatisticsParameters,
943
+ ctx: "ToolContext",
944
+ ) -> dict[str, Any]:
945
+ """
946
+ Compute Local Join Count statistics for MULTI-CATEGORY categorical spatial data.
947
+
948
+ This method extends traditional Join Count statistics to handle data with more than
949
+ 2 categories by using Local Join Count Statistics (Anselin & Li 2019). Each category
950
+ is converted to a binary indicator variable, and local statistics are computed to
951
+ identify spatial clusters of each category.
952
+
953
+ WHEN TO USE:
954
+ - Data has MORE THAN 2 categories (e.g., cell types, tissue domains)
955
+ - Want to identify WHERE each category spatially clusters
956
+ - Need category-specific clustering patterns
957
+
958
+ For binary data (exactly 2 categories), use 'join_count' instead for traditional
959
+ global statistics.
960
+
961
+ METHOD:
962
+ 1. One-hot encode: Convert multi-category variable to binary indicators
963
+ 2. For each category: Compute local join count (# of same-category neighbors)
964
+ 3. Permutation test: Assess statistical significance
965
+ 4. Store results: Local statistics in adata.obs, summary in return value
966
+
967
+ Parameters
968
+ ----------
969
+ adata : AnnData
970
+ Annotated data object with spatial coordinates in .obsm['spatial']
971
+ cluster_key : str
972
+ Column in adata.obs containing the categorical variable (can have any number of categories)
973
+ params : SpatialStatisticsParameters
974
+ Analysis parameters including n_neighbors
975
+ ctx : ToolContext
976
+ ToolContext for logging and data access
977
+
978
+ Returns
979
+ -------
980
+ Dict[str, Any]
981
+ Dictionary containing:
982
+ - method: Method name and reference
983
+ - n_categories: Number of categories analyzed
984
+ - categories: List of category names
985
+ - per_category_stats: Statistics for each category
986
+ - total_joins: Sum of local join counts across all locations
987
+ - mean_local_joins: Average local join count per location
988
+ - n_significant: Number of locations with significant clustering (p < 0.05)
989
+ - n_hotspots: Number of locations with positive significant clustering
990
+ - interpretation: How to interpret the results
991
+
992
+ Notes
993
+ -----
994
+ Results are stored in adata.obs as:
995
+ - 'ljc_{category}': Local join count values for each category
996
+ - 'ljc_{category}_pvalue': Significance levels (from permutation test)
997
+
998
+ High local join count values indicate locations where category members cluster together.
999
+ P-values < 0.05 indicate statistically significant local clustering.
1000
+
1001
+ References
1002
+ ----------
1003
+ Anselin, L., & Li, X. (2019). Operational Local Join Count Statistics for Cluster Detection.
1004
+ Journal of Geographical Systems, 21(2), 189–210.
1005
+ https://doi.org/10.1007/s10109-019-00299-x
1006
+
1007
+ See Also
1008
+ --------
1009
+ _analyze_join_count : For binary data (2 categories) using traditional Join Count
1010
+
1011
+ Examples
1012
+ --------
1013
+ For a dataset with 7 cell type categories:
1014
+ >>> result = _analyze_local_join_count(adata, 'leiden', params, ctx)
1015
+ >>> # Check which cell types show significant clustering
1016
+ >>> for cat, stats in result['per_category_stats'].items():
1017
+ ... print(f"{cat}: {stats['n_hotspots']} significant hotspots")
1018
+ """
1019
+ # Check for required dependencies
1020
+ if not is_available("esda") or not is_available("libpysal"):
1021
+ raise DependencyError(
1022
+ "esda or libpysal package not installed (requires esda >= 2.4.0). "
1023
+ "Install with: pip install esda libpysal"
1024
+ )
1025
+
1026
+ try:
1027
+ from esda.join_counts_local import Join_Counts_Local
1028
+ from libpysal.weights import KNN
1029
+
1030
+ coords = require_spatial_coords(adata)
1031
+
1032
+ # Create PySAL W object directly from coordinates using KNN
1033
+ # This ensures compatibility with Join_Counts_Local
1034
+ w = KNN.from_array(coords, k=params.n_neighbors)
1035
+
1036
+ # Get unique categories
1037
+ categories = adata.obs[cluster_key].unique()
1038
+ n_categories = len(categories)
1039
+
1040
+ results = {}
1041
+
1042
+ # Analyze each category separately
1043
+ for category in categories:
1044
+ # Create binary indicator: 1 if cell is this category, 0 otherwise
1045
+ y = (adata.obs[cluster_key] == category).astype(int).values
1046
+
1047
+ # Compute Local Join Count statistics
1048
+ ljc = Join_Counts_Local(connectivity=w).fit(y)
1049
+
1050
+ # Store local statistics in adata.obs
1051
+ adata.obs[f"ljc_{category}"] = ljc.LJC
1052
+ adata.obs[f"ljc_{category}_pvalue"] = ljc.p_sim
1053
+
1054
+ # Compute summary statistics
1055
+ results[str(category)] = {
1056
+ "total_joins": float(ljc.LJC.sum()),
1057
+ "mean_local_joins": float(ljc.LJC.mean()),
1058
+ "std_local_joins": float(ljc.LJC.std()),
1059
+ "n_significant": int((ljc.p_sim < 0.05).sum()),
1060
+ "n_hotspots": int(((ljc.LJC > 0) & (ljc.p_sim < 0.05)).sum()),
1061
+ }
1062
+
1063
+ # Store summary in adata.uns
1064
+ adata.uns["local_join_count"] = {
1065
+ "method": "Local Join Count Statistics (Anselin & Li 2019)",
1066
+ "cluster_key": cluster_key,
1067
+ "n_categories": n_categories,
1068
+ "categories": [str(c) for c in categories],
1069
+ "n_neighbors": params.n_neighbors,
1070
+ "per_category_stats": results,
1071
+ }
1072
+
1073
+ return {
1074
+ "method": "Local Join Count Statistics (Anselin & Li 2019)",
1075
+ "n_categories": n_categories,
1076
+ "categories": [str(c) for c in categories],
1077
+ "per_category_stats": results,
1078
+ "interpretation": (
1079
+ "Local Join Count statistics identify spatial clusters for each category. "
1080
+ "High LJC values indicate locations where category members cluster together. "
1081
+ "P-values < 0.05 indicate statistically significant local clustering. "
1082
+ "Results stored in adata.obs as 'ljc_{category}' and 'ljc_{category}_pvalue'."
1083
+ ),
1084
+ }
1085
+
1086
+ except Exception as e:
1087
+ raise ProcessingError(f"Local Join Count analysis failed: {e}") from e
1088
+
1089
+
1090
+ def _analyze_network_properties(
1091
+ adata: ad.AnnData,
1092
+ cluster_key: str,
1093
+ params: SpatialStatisticsParameters,
1094
+ ctx: "ToolContext",
1095
+ ) -> dict[str, Any]:
1096
+ """
1097
+ Analyze network properties of spatial graph.
1098
+
1099
+ Migrated from spatial_statistics.py
1100
+ """
1101
+ # Check for required dependencies
1102
+ if not is_available("networkx"):
1103
+ raise DependencyError(
1104
+ "networkx package required. Install with: pip install networkx"
1105
+ )
1106
+
1107
+ try:
1108
+ import networkx as nx
1109
+ from scipy.sparse import csr_matrix # noqa: F401
1110
+
1111
+ # Get or create spatial connectivity
1112
+ if "spatial_connectivities" in adata.obsp:
1113
+ conn_matrix = adata.obsp["spatial_connectivities"]
1114
+ else:
1115
+ # Create connectivity matrix
1116
+ from sklearn.neighbors import kneighbors_graph
1117
+
1118
+ coords = require_spatial_coords(adata)
1119
+ conn_matrix = kneighbors_graph(
1120
+ coords, n_neighbors=params.n_neighbors, mode="connectivity"
1121
+ )
1122
+
1123
+ # Convert to networkx graph
1124
+ G = nx.from_scipy_sparse_array(conn_matrix)
1125
+
1126
+ # Compute properties
1127
+ properties = {
1128
+ "n_nodes": G.number_of_nodes(),
1129
+ "n_edges": G.number_of_edges(),
1130
+ "density": float(nx.density(G)),
1131
+ "is_connected": nx.is_connected(G),
1132
+ "n_components": nx.number_connected_components(G),
1133
+ }
1134
+
1135
+ # Additional metrics for connected graphs
1136
+ if properties["is_connected"]:
1137
+ properties["diameter"] = nx.diameter(G)
1138
+ properties["radius"] = nx.radius(G)
1139
+ else:
1140
+ # Analyze largest component
1141
+ largest_cc = max(nx.connected_components(G), key=len)
1142
1143
+ properties["largest_component_size"] = len(largest_cc)
1144
+ properties["largest_component_fraction"] = (
1145
+ len(largest_cc) / G.number_of_nodes()
1146
+ )
1147
+
1148
+ # Clustering coefficient
1149
+ try:
1150
+ properties["avg_clustering"] = float(nx.average_clustering(G))
1151
+ except Exception:
1152
+ properties["avg_clustering"] = None
1153
+
1154
+ # Degree statistics
1155
+ degrees = dict(G.degree())
1156
+ degree_values = list(degrees.values())
1157
+ properties["degree_mean"] = float(np.mean(degree_values))
1158
+ properties["degree_std"] = float(np.std(degree_values))
1159
+
1160
+ return properties
1161
+
1162
+ except Exception as e:
1163
+ raise ProcessingError(f"Network properties analysis failed: {e}") from e
1164
+
1165
+
1166
+ async def _analyze_spatial_centrality(
1167
+ adata: ad.AnnData,
1168
+ cluster_key: str,
1169
+ params: SpatialStatisticsParameters,
1170
+ ctx: "ToolContext",
1171
+ ) -> dict[str, Any]:
1172
+ """
1173
+ Compute various centrality measures for spatial network.
1174
+
1175
+ Migrated from spatial_statistics.py
1176
+ """
1177
+ # Check for required dependencies
1178
+ if not is_available("networkx"):
1179
+ raise DependencyError(
1180
+ "NetworkX required for centrality analysis. Install with: pip install networkx"
1181
+ )
1182
+
1183
+ try:
1184
+ import networkx as nx
1185
+
1186
+ # Get connectivity matrix
1187
+ if "spatial_connectivities" in adata.obsp:
1188
+ conn_matrix = adata.obsp["spatial_connectivities"]
1189
+ else:
1190
+ from sklearn.neighbors import kneighbors_graph
1191
+
1192
+ coords = require_spatial_coords(adata)
1193
+ conn_matrix = kneighbors_graph(
1194
+ coords, n_neighbors=params.n_neighbors, mode="connectivity"
1195
+ )
1196
+
1197
+ # Convert to networkx
1198
+ G = nx.from_scipy_sparse_array(conn_matrix)
1199
+
1200
+ # Compute centrality measures (returns dict with integer keys)
1201
+ degree_centrality = nx.degree_centrality(G)
1202
+ closeness_centrality = nx.closeness_centrality(G)
1203
+ betweenness_centrality = nx.betweenness_centrality(G)
1204
+
1205
+ # FIX: NetworkX returns {0: val0, 1: val1, ...} with integer keys,
1206
+ # but adata.obs_names are strings. We need to extract values in order.
1207
+ # Bug: pd.Series(dict) cannot align integer keys to string obs_names
1208
+ n_nodes = adata.n_obs
1209
+
1210
+ # Validate that all expected node keys exist in centrality results
1211
+ # This catches edge cases like disconnected graphs or isolated nodes
1212
+ expected_keys = set(range(n_nodes))
1213
+ missing_degree = expected_keys - set(degree_centrality.keys())
1214
+ missing_closeness = expected_keys - set(closeness_centrality.keys())
1215
+ missing_betweenness = expected_keys - set(betweenness_centrality.keys())
1216
+
1217
+ if missing_degree or missing_closeness or missing_betweenness:
1218
+ await ctx.warning(
1219
+ f"Centrality computation incomplete: "
1220
+ f"missing degree={len(missing_degree)}, "
1221
+ f"closeness={len(missing_closeness)}, "
1222
+ f"betweenness={len(missing_betweenness)} nodes. "
1223
+ f"Graph may have disconnected components."
1224
+ )
1225
+
1226
+ # Use .get() with default 0.0 for missing nodes (isolated/disconnected)
1227
+ degree_vals = np.array([degree_centrality.get(i, 0.0) for i in range(n_nodes)])
1228
+ closeness_vals = np.array(
1229
+ [closeness_centrality.get(i, 0.0) for i in range(n_nodes)]
1230
+ )
1231
+ betweenness_vals = np.array(
1232
+ [betweenness_centrality.get(i, 0.0) for i in range(n_nodes)]
1233
+ )
1234
+
1235
+ # Store in adata.obs (directly as numpy array)
1236
+ adata.obs["degree_centrality"] = degree_vals
1237
+ adata.obs["closeness_centrality"] = closeness_vals
1238
+ adata.obs["betweenness_centrality"] = betweenness_vals
1239
+
1240
+ # Compute statistics by cluster
1241
+ centrality_stats = {}
1242
+ for cluster in adata.obs[cluster_key].unique():
1243
+ mask = adata.obs[cluster_key] == cluster
1244
+ centrality_stats[str(cluster)] = {
1245
+ "mean_degree": float(adata.obs.loc[mask, "degree_centrality"].mean()),
1246
+ "mean_closeness": float(
1247
+ adata.obs.loc[mask, "closeness_centrality"].mean()
1248
+ ),
1249
+ "mean_betweenness": float(
1250
+ adata.obs.loc[mask, "betweenness_centrality"].mean()
1251
+ ),
1252
+ }
1253
+
1254
+ return {
1255
+ "centrality_computed": True,
1256
+ "cluster_centrality": centrality_stats,
1257
+ "global_stats": {
1258
+ "mean_degree": float(np.mean(list(degree_centrality.values()))),
1259
+ "mean_closeness": float(np.mean(list(closeness_centrality.values()))),
1260
+ "mean_betweenness": float(
1261
+ np.mean(list(betweenness_centrality.values()))
1262
+ ),
1263
+ },
1264
+ }
1265
+
1266
+ except Exception as e:
1267
+ raise ProcessingError(f"Spatial centrality analysis failed: {e}") from e
1268
+
1269
+
1270
+ async def _analyze_local_moran(
1271
+ adata: ad.AnnData,
1272
+ params: SpatialStatisticsParameters,
1273
+ ctx: "ToolContext",
1274
+ ) -> dict[str, Any]:
1275
+ """
1276
+ Calculate Local Moran's I (LISA) for spatial clustering detection.
1277
+
1278
+ Local Moran's I identifies spatial clusters and outliers by measuring
1279
+ the local spatial autocorrelation for each observation.
1280
+
1281
+ Parameters
1282
+ ----------
1283
+ adata : ad.AnnData
1284
+ Annotated data object
1285
+ params : SpatialStatisticsParameters
1286
+ Analysis parameters including genes to analyze
1287
+ ctx : ToolContext
1288
+ ToolContext for logging and data access
1289
+
1290
+ Returns
1291
+ -------
1292
+ Dict[str, Any]
1293
+ Results including Local Moran's I values and statistics for each gene
1294
+
1295
+ Notes
1296
+ -----
1297
+ This implementation uses PySAL's esda.Moran_Local with permutation-based
1298
+ significance testing, following best practices from:
1299
+ - GeoDa Center: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html
1300
+ - PySAL documentation: https://pysal.org/esda/generated/esda.Moran_Local.html
1301
+
1302
+ The permutation approach holds each observation fixed while randomly permuting
1303
+ the remaining n-1 values to generate a reference distribution for significance
1304
+ testing. This is more robust than parametric approaches as it makes fewer
1305
+ distributional assumptions.
1306
+
1307
+ Quadrant classification (LISA clusters):
1308
+ - HH (High-High): Hot spots - high values surrounded by high values
1309
+ - LL (Low-Low): Cold spots - low values surrounded by low values
1310
+ - HL (High-Low): High outliers - high values surrounded by low values
1311
+ - LH (Low-High): Low outliers - low values surrounded by high values
1312
+ """
1313
+ # Import PySAL components for proper LISA analysis
1314
+ require("esda") # Raises ImportError with install instructions if missing
1315
+ require("libpysal") # Raises ImportError with install instructions if missing
1316
+ from esda.moran import Moran_Local
1317
+ from libpysal.weights import W as PySALWeights
1318
+
1319
+ try:
1320
+ # Ensure spatial neighbors exist
1321
+ await ensure_spatial_neighbors_async(adata, ctx, n_neighs=params.n_neighbors)
1322
+
1323
+ # Unified gene selection (default 5 genes for computational efficiency)
1324
+ n_genes = (
1325
+ min(5, params.n_top_genes) if params.genes is None else params.n_top_genes
1326
+ )
1327
+ valid_genes = select_genes_for_analysis(
1328
+ adata,
1329
+ genes=params.genes,
1330
+ n_genes=n_genes,
1331
+ analysis_name="Local Moran's I (LISA)",
1332
+ )
1333
+
1334
+ # Convert spatial connectivity matrix to PySAL weights format
1335
+ W_sparse = adata.obsp["spatial_connectivities"]
1336
+
1337
+ # Create PySAL weights from sparse matrix using optimized CSR access
1338
+ # Direct CSR array access avoids per-row object creation (15x faster)
1339
+ from scipy.sparse import csr_matrix
1340
+
1341
+ if not isinstance(W_sparse, csr_matrix):
1342
+ W_sparse = csr_matrix(W_sparse)
1343
+
1344
+ neighbors_dict = {}
1345
+ weights_dict = {}
1346
+ n_obs = W_sparse.shape[0]
1347
+
1348
+ # Direct access to CSR internal arrays
1349
+ indptr = W_sparse.indptr
1350
+ indices = W_sparse.indices
1351
+ data = W_sparse.data
1352
+
1353
+ for i in range(n_obs):
1354
+ start, end = indptr[i], indptr[i + 1]
1355
+ neighbors_dict[i] = indices[start:end].tolist()
1356
+ weights_dict[i] = data[start:end].tolist()
1357
+
1358
+ w = PySALWeights(neighbors_dict, weights_dict)
1359
+
1360
+ # Get analysis parameters
1361
+ permutations = params.local_moran_permutations
1362
+ alpha = params.local_moran_alpha
1363
+ use_fdr = params.local_moran_fdr_correction
1364
+
1365
+ # Memory-efficient streaming: extract one gene at a time
1366
+ # This reduces memory from O(n_spots × n_genes) to O(n_spots)
1367
+ # Critical for large datasets (Visium HD: 50K+ spots × 500 genes = 200MB+)
1368
+ results = {}
1369
+ for gene in valid_genes:
1370
+ # Extract single gene column - memory efficient for sparse matrices
1371
+ gene_idx = adata.var_names.get_loc(gene)
1372
+ expr = to_dense(adata.X[:, gene_idx]).flatten()
1373
+
1374
+ # CRITICAL: Convert to float64 for PySAL/numba compatibility
1375
+ # PySAL's Moran_Local uses numba JIT compilation which requires
1376
+ # consistent dtypes (float64) for matrix operations
1377
+ expr = expr.astype(np.float64, copy=False)
1378
+
1379
+ # Run PySAL Local Moran's I with permutation testing
1380
+ lisa = Moran_Local(expr, w, permutations=permutations)
1381
+
1382
+ # Store local I values in adata.obs
1383
+ adata.obs[f"{gene}_local_morans"] = lisa.Is
1384
+
1385
+ # Get p-values from permutation test
1386
+ p_values = lisa.p_sim
1387
+
1388
+ # Apply FDR correction if requested
1389
+ if use_fdr and permutations > 0:
1390
+ # Check statsmodels availability for FDR correction
1391
+ require(
1392
+ "statsmodels"
1393
+ ) # Raises ImportError with install instructions if missing
1394
+ from statsmodels.stats.multitest import multipletests
1395
+
1396
+ _, p_corrected, _, _ = multipletests(
1397
+ p_values, alpha=alpha, method="fdr_bh"
1398
+ )
1399
+ significant = p_corrected < alpha
1400
+ else:
1401
+ significant = p_values < alpha
1402
+
1403
+ # Classify by quadrant AND significance
1404
+ # PySAL quadrant codes: 1=HH, 2=LH, 3=LL, 4=HL
1405
+ q = lisa.q
1406
+
1407
+ # Hot spots: High-High clusters (significant positive spatial autocorrelation)
1408
+ hotspots = np.where((q == 1) & significant)[0].tolist()
1409
+ # Cold spots: Low-Low clusters (significant positive spatial autocorrelation)
1410
+ coldspots = np.where((q == 3) & significant)[0].tolist()
1411
+ # High outliers: High values surrounded by low values
1412
+ high_outliers = np.where((q == 4) & significant)[0].tolist()
1413
+ # Low outliers: Low values surrounded by high values
1414
+ low_outliers = np.where((q == 2) & significant)[0].tolist()
1415
+
1416
+ # Store quadrant classification in adata.obs
1417
+ quadrant_labels = np.array(["Not Significant"] * n_obs)
1418
+ quadrant_labels[(q == 1) & significant] = "HH (Hot Spot)"
1419
+ quadrant_labels[(q == 3) & significant] = "LL (Cold Spot)"
1420
+ quadrant_labels[(q == 4) & significant] = "HL (High Outlier)"
1421
+ quadrant_labels[(q == 2) & significant] = "LH (Low Outlier)"
1422
+ adata.obs[f"{gene}_lisa_cluster"] = pd.Categorical(quadrant_labels)
1423
+
1424
+ # Store p-values
1425
+ adata.obs[f"{gene}_lisa_pvalue"] = p_values
1426
+
1427
+ results[gene] = {
1428
+ "mean_I": float(np.mean(lisa.Is)),
1429
+ "std_I": float(np.std(lisa.Is)),
1430
+ "min_I": float(np.min(lisa.Is)),
1431
+ "max_I": float(np.max(lisa.Is)),
1432
+ "n_significant": int(np.sum(significant)),
1433
+ "n_hotspots": len(hotspots), # HH clusters
1434
+ "n_coldspots": len(coldspots), # LL clusters
1435
+ "n_high_outliers": len(high_outliers), # HL
1436
+ "n_low_outliers": len(low_outliers), # LH
1437
+ "permutations": permutations,
1438
+ "alpha": alpha,
1439
+ "fdr_corrected": use_fdr,
1440
+ }
1441
+
1442
+ # Store summary in uns
1443
+ adata.uns["local_moran"] = {
1444
+ "genes_analyzed": valid_genes,
1445
+ "n_neighbors": params.n_neighbors,
1446
+ "permutations": permutations,
1447
+ "alpha": alpha,
1448
+ "fdr_corrected": use_fdr,
1449
+ "results": results,
1450
+ "method": "PySAL esda.Moran_Local",
1451
+ "reference": "Anselin, L. (1995). Local Indicators of Spatial Association - LISA",
1452
+ }
1453
+
1454
+ return {
1455
+ "analysis_type": "local_moran",
1456
+ "genes_analyzed": valid_genes,
1457
+ "results": results,
1458
+ "parameters": {
1459
+ "permutations": permutations,
1460
+ "alpha": alpha,
1461
+ "fdr_corrected": use_fdr,
1462
+ "n_neighbors": params.n_neighbors,
1463
+ },
1464
+ "interpretation": (
1465
+ "LISA (Local Indicators of Spatial Association) identifies statistically "
1466
+ "significant spatial clusters and outliers using permutation-based testing. "
1467
+ "HH (Hot Spots): high values clustered together. "
1468
+ "LL (Cold Spots): low values clustered together. "
1469
+ "HL/LH (Outliers): values significantly different from neighbors. "
1470
+ f"Significance determined by {permutations} permutations "
1471
+ f"with alpha={alpha}{' and FDR correction' if use_fdr else ''}."
1472
+ ),
1473
+ }
1474
+
1475
+ except Exception as e:
1476
+ raise ProcessingError(f"Local Moran's I analysis failed: {e}") from e