chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1603 @@
1
+ """
2
+ Cell-cell communication analysis tools for spatial transcriptomics data.
3
+ """
4
+
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ import numpy as np
8
+
9
+ if TYPE_CHECKING:
10
+ from ..spatial_mcp_adapter import ToolContext
11
+
12
+ from ..models.analysis import CellCommunicationResult
13
+ from ..models.data import CellCommunicationParameters
14
+ from ..utils import validate_obs_column
15
+ from ..utils.adata_utils import get_spatial_key, to_dense
16
+ from ..utils.dependency_manager import require, validate_r_package
17
+ from ..utils.exceptions import (
18
+ DataNotFoundError,
19
+ DependencyError,
20
+ ParameterError,
21
+ ProcessingError,
22
+ )
23
+
24
+
25
async def _validate_liana_requirements(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> None:
    """Check that *adata* satisfies LIANA+ prerequisites before analysis.

    Raises:
        DataNotFoundError: If spatial analysis was requested but no spatial
            connectivity graph is present in ``adata.obsp``.

    Also validates the configured cell-type column via ``validate_obs_column``
    and warns (without failing) about a likely resource mismatch on mouse data.
    """
    missing_spatial_graph = (
        params.perform_spatial_analysis
        and "spatial_connectivities" not in adata.obsp
    )
    if missing_spatial_graph:
        guidance = (
            "Spatial connectivity required for LIANA+ bivariate analysis.\n\n"
            "Run spatial neighbor computation first:\n"
            " import squidpy as sq\n"
            " sq.gr.spatial_neighbors(adata, coord_type='grid', n_rings=1)\n\n"
            "Platform-specific recommendations:\n"
            " Visium: coord_type='grid', n_rings=1-2\n"
            " MERFISH: coord_type='generic', radius=20-50\n"
            " Slide-seq: coord_type='generic', n_neighs=10-30"
        )
        raise DataNotFoundError(guidance)

    # The grouping column must exist in adata.obs before any backend runs.
    validate_obs_column(adata, params.cell_type_key, "Cell type")

    # Mouse data combined with the default 'consensus' resource is a common
    # misconfiguration -- emit a warning but let the analysis proceed.
    if params.species == "mouse" and params.liana_resource == "consensus":
        await ctx.warning(
            "Using 'consensus' for mouse data. Consider liana_resource='mouseconsensus'."
        )
50
+
51
+
52
async def analyze_cell_communication(
    data_id: str,
    ctx: "ToolContext",
    params: CellCommunicationParameters,  # No default - must be provided by caller (LLM)
) -> CellCommunicationResult:
    """Analyze cell-cell communication in spatial transcriptomics data

    Dispatches to one of four backends selected by ``params.method``
    ('liana', 'cellphonedb', 'cellchat_r', 'fastccc'), records
    reproducibility metadata on the AnnData object, and builds a
    summary ``CellCommunicationResult``.

    Args:
        data_id: Dataset ID
        ctx: ToolContext for data access and logging
        params: Cell communication analysis parameters

    Returns:
        Cell communication analysis result

    Raises:
        ProcessingError: Any failure inside the try block (including the
            ParameterError raised for an unsupported method) is wrapped
            and re-raised as ProcessingError.
    """
    # Get data via ToolContext
    adata = await ctx.get_adata(data_id)

    try:
        # Apply method-specific validation
        if params.method == "liana":
            # LIANA-based methods need spatial connectivity validation
            await _validate_liana_requirements(adata, params, ctx)
        elif params.method == "cellphonedb":
            # Check if cell type column exists
            validate_obs_column(adata, params.cell_type_key, "Cell type")

        # Check for low counts (warnings only -- the analysis still proceeds)
        n_genes = adata.raw.n_vars if adata.raw is not None else adata.n_vars
        if n_genes < 5000:
            await ctx.warning(
                f"Gene count ({n_genes}) is relatively low. "
                f"This may limit the number of interactions found."
            )

        if adata.n_obs < 100:
            await ctx.warning(
                f"Cell count ({adata.n_obs}) is relatively low. "
                f"This may affect statistical power."
            )

        # Note: LIANA internally handles use_raw parameter automatically
        # No need for manual data_source switching - consistent with other tools

        # Analyze cell communication using selected method
        if params.method == "liana":
            require("liana", ctx, feature="LIANA+ cell communication analysis")
            result_data = await _analyze_communication_liana(adata, params, ctx)

        elif params.method == "cellphonedb":
            require(
                "cellphonedb", ctx, feature="CellPhoneDB cell communication analysis"
            )
            result_data = await _analyze_communication_cellphonedb(adata, params, ctx)

        elif params.method == "cellchat_r":
            validate_r_package(
                "CellChat",
                ctx,
                install_cmd="devtools::install_github('jinworks/CellChat')",
            )
            # NOTE(review): this call is not awaited, unlike the other
            # backends -- presumably a synchronous implementation; confirm.
            result_data = _analyze_communication_cellchat_r(adata, params, ctx)

        elif params.method == "fastccc":
            require("fastccc", ctx, feature="FastCCC cell communication analysis")
            result_data = await _analyze_communication_fastccc(adata, params, ctx)

        else:
            raise ParameterError(
                f"Unsupported method: {params.method}. "
                f"Supported methods: 'liana', 'cellphonedb', 'cellchat_r', 'fastccc'"
            )

        # Note: Results are already stored in adata.uns by the analysis methods
        # Since ctx.get_adata() returns a reference to the stored object,
        # modifications to adata.uns are automatically persisted

        # Store scientific metadata for reproducibility
        from ..utils.adata_utils import store_analysis_metadata

        # Determine database used (recorded in metadata and the result model)
        if params.method == "liana":
            database = params.liana_resource
        elif params.method == "cellphonedb":
            database = "cellphonedb"
        elif params.method == "cellchat_liana":
            # NOTE(review): 'cellchat_liana' is never dispatched above, so this
            # branch appears unreachable from this function -- confirm intent.
            database = (
                "cellchatdb"  # Match actual LIANA resource name used in implementation
            )
        elif params.method == "cellchat_r":
            database = f"CellChatDB.{params.species}"  # Native R CellChat database
        elif params.method == "fastccc":
            database = "fastccc_builtin"  # FastCCC built-in LR database
        else:
            database = "unknown"

        # Extract results keys reported by the backend, grouped by AnnData slot
        results_keys_dict = {"obs": [], "obsm": [], "uns": []}

        if result_data.get("liana_results_key"):
            results_keys_dict["uns"].append(result_data["liana_results_key"])
        if result_data.get("liana_spatial_results_key"):
            results_keys_dict["uns"].append(result_data["liana_spatial_results_key"])
        if result_data.get("liana_spatial_scores_key"):
            results_keys_dict["obsm"].append(result_data["liana_spatial_scores_key"])
        if result_data.get("cellphonedb_results_key"):
            results_keys_dict["uns"].append(result_data["cellphonedb_results_key"])
        if result_data.get("cellchat_r_results_key"):
            results_keys_dict["uns"].append(result_data["cellchat_r_results_key"])
        if result_data.get("fastccc_results_key"):
            results_keys_dict["uns"].append(result_data["fastccc_results_key"])

        # Store metadata (method-specific parameters are None for other methods)
        store_analysis_metadata(
            adata,
            analysis_name=f"cell_communication_{params.method}",
            method=params.method,
            parameters={
                "cell_type_key": params.cell_type_key,
                "n_perms": (params.liana_n_perms if params.method == "liana" else None),
                "nz_prop": (params.liana_nz_prop if params.method == "liana" else None),
                "min_cells": params.min_cells,
                "iterations": (
                    params.cellphonedb_iterations
                    if params.method == "cellphonedb"
                    else None
                ),
                "threshold": (
                    params.cellphonedb_threshold
                    if params.method == "cellphonedb"
                    else None
                ),
            },
            results_keys=results_keys_dict,
            statistics={
                "n_lr_pairs": result_data["n_lr_pairs"],
                "n_significant_pairs": result_data["n_significant_pairs"],
                "analysis_type": result_data.get("analysis_type"),
            },
            species=params.species,
            database=database,
        )

        # Create result (missing keys in result_data fall back to defaults)
        result = CellCommunicationResult(
            data_id=data_id,
            method=params.method,
            species=params.species,
            database=database,  # Use actual database/resource determined above
            n_lr_pairs=result_data["n_lr_pairs"],
            n_significant_pairs=result_data["n_significant_pairs"],
            global_results_key=result_data.get("global_results_key"),
            top_lr_pairs=result_data.get("top_lr_pairs", []),
            local_analysis_performed=result_data.get("local_analysis_performed", False),
            local_results_key=result_data.get("local_results_key"),
            communication_matrices_key=result_data.get("communication_matrices_key"),
            liana_results_key=result_data.get("liana_results_key"),
            liana_spatial_results_key=result_data.get("liana_spatial_results_key"),
            liana_spatial_scores_key=result_data.get("liana_spatial_scores_key"),
            analysis_type=result_data.get("analysis_type"),
            patterns_identified=result_data.get("patterns_identified", False),
            n_patterns=result_data.get("n_patterns"),
            patterns_key=result_data.get("patterns_key"),
            visualization=None,  # Use visualize_data tool instead
            network_visualization=None,  # Use visualize_data tool instead
            statistics=result_data.get("statistics", {}),
        )

        return result

    except Exception as e:
        raise ProcessingError(f"Error in cell communication analysis: {e}") from e
224
+
225
+
226
async def _analyze_communication_liana(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using LIANA+

    Ensures a spatial neighbor graph exists (computing one with Squidpy when
    absent), validates that a species was specified, then routes to either the
    cluster-based or the spatial bivariate LIANA+ sub-analysis.

    Args:
        adata: AnnData object to analyze (modified in place by sub-analyses).
        params: Cell communication analysis parameters.
        ctx: ToolContext for logging.

    Returns:
        Result dictionary produced by the selected LIANA+ sub-analysis.

    Raises:
        ProcessingError: Any failure (including the ParameterError for a
            missing species) is wrapped and re-raised.
    """
    # Use centralized dependency manager for consistent error handling
    require("liana")  # Raises ImportError with install instructions if missing
    import liana as li  # noqa: F401

    try:
        # Ensure spatial connectivity is computed
        if "spatial_connectivities" not in adata.obsp:
            # Use parameters from user or determine optimal bandwidth based on data size
            if params.liana_bandwidth is not None:
                bandwidth = params.liana_bandwidth
            elif adata.n_obs > 3000:
                bandwidth = 300  # Larger bandwidth for large datasets
            else:
                bandwidth = 200  # Standard bandwidth

            # Use Squidpy for spatial neighbor computation
            # Note: Spatial analysis requires spatial neighbors (physical coordinates), not expression neighbors
            # Use centralized dependency manager for consistent error handling
            require(
                "squidpy"
            )  # Raises ImportError with install instructions if missing
            import squidpy as sq

            # Squidpy's spatial_neighbors uses PHYSICAL coordinates
            # NOTE(review): bandwidth is always truthy here (300/200 fallback),
            # so radius is always passed alongside delaunay=True -- confirm
            # that this combination is the intended Squidpy configuration.
            sq.gr.spatial_neighbors(
                adata,
                coord_type="generic",
                n_neighs=min(30, max(6, adata.n_obs // 100)),  # Adaptive neighbor count
                radius=bandwidth if bandwidth else None,
                delaunay=True,  # Use Delaunay triangulation for spatial data
                set_diag=False,  # Standard practice for spatial graphs
            )

        # Validate species parameter is specified
        if not params.species:
            raise ParameterError(
                "Species parameter is required!\n\n"
                "You must explicitly specify the species of your data:\n"
                " - species='human': For human data (genes like ACTB, GAPDH)\n"
                " - species='mouse': For mouse data (genes like Actb, Gapdh)\n"
                " - species='zebrafish': For zebrafish data\n\n"
                "Example usage:\n"
                " params = {\n"
                " 'species': 'mouse',\n"
                " 'cell_type_key': 'cell_type',\n"
                " 'liana_resource': 'mouseconsensus'\n"
                " }"
            )

        # Determine analysis type based on data characteristics
        has_clusters = params.cell_type_key in adata.obs.columns

        if has_clusters and not params.perform_spatial_analysis:
            # Single-cell style analysis with clusters
            return _run_liana_cluster_analysis(adata, params, ctx)
        else:
            # Spatial bivariate analysis
            return _run_liana_spatial_analysis(adata, params, ctx)

    except Exception as e:
        raise ProcessingError(f"LIANA+ analysis failed: {e}") from e
291
+
292
+
293
+ def _get_liana_resource_name(species: str, resource_preference: str) -> str:
294
+ """Get appropriate LIANA+ resource name based on species with enhanced resource support"""
295
+ if species == "mouse":
296
+ # Mouse-specific resources
297
+ mouse_resources = ["mouseconsensus", "cellphonedb", "celltalkdb", "icellnet"]
298
+
299
+ if resource_preference == "consensus":
300
+ return "mouseconsensus" # Auto-map consensus to mouseconsensus for mouse
301
+ elif resource_preference in mouse_resources:
302
+ return (
303
+ resource_preference # Use as specified if it's a valid mouse resource
304
+ )
305
+ else:
306
+ # For non-mouse-specific resources, still use them but could warn
307
+ return resource_preference
308
+ else:
309
+ # For human or other species, use as specified
310
+ return resource_preference
311
+
312
+
313
def _run_liana_cluster_analysis(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Run cluster-level LIANA+ rank-aggregate analysis.

    Executes ``li.mt.rank_aggregate`` grouped by the configured cell-type
    column, counts significant pairs via ``magnitude_rank``, and stores the
    top ligand-receptor pairs on ``adata.uns`` for downstream visualization.

    Returns:
        Summary dict with pair counts, top pairs, and run statistics.
    """
    import liana as li

    # Required grouping column from params (no auto-detection performed).
    group_key = params.cell_type_key
    validate_obs_column(adata, group_key, "Cell type")

    # Species-aware resource resolution (mouse 'consensus' gets remapped).
    resource = _get_liana_resource_name(params.species, params.liana_resource)
    permutations = params.liana_n_perms

    # Aggregate multiple LIANA scoring methods into consensus ranks.
    li.mt.rank_aggregate(
        adata,
        groupby=group_key,
        resource_name=resource,
        expr_prop=params.liana_nz_prop,
        min_cells=params.min_cells,
        n_perms=permutations,
        verbose=False,
        use_raw=adata.raw is not None,
    )

    res_df = adata.uns["liana_res"]

    # Significance is counted on magnitude_rank (signal strength); the
    # specificity_rank column has a non-uniform distribution and is avoided.
    alpha = params.liana_significance_alpha
    total_pairs = len(res_df)
    significant_pairs = len(res_df[res_df["magnitude_rank"] <= alpha])

    # Extract the best-ranked pairs with vectorized column access.
    top_pair_names: list[str] = []
    pair_tuples: list = []
    if "magnitude_rank" in res_df.columns:
        best = res_df.nsmallest(params.plot_top_pairs, "magnitude_rank")
        lig_arr = best["ligand_complex"].values
        rec_arr = best["receptor_complex"].values
        pair_tuples = list(zip(lig_arr, rec_arr))
        top_pair_names = [f"{lig}_{rec}" for lig, rec in pair_tuples]

    # Persist standardized results for the visualization tools.
    adata.uns["detected_lr_pairs"] = pair_tuples
    adata.uns["cell_communication_results"] = {
        "top_lr_pairs": top_pair_names,
        "method": "liana_cluster",
        "n_pairs": len(top_pair_names),
        "species": params.species,
    }

    run_stats = {
        "method": "liana_cluster",
        "groupby": group_key,
        "n_lr_pairs_tested": total_pairs,
        "n_permutations": permutations,
        "significance_threshold": alpha,
        "resource": params.liana_resource,
    }

    # The raw LIANA results key is deliberately omitted from the return value
    # to avoid serializing a potentially huge DataFrame downstream.
    return {
        "n_lr_pairs": total_pairs,
        "n_significant_pairs": significant_pairs,
        "top_lr_pairs": top_pair_names,
        "analysis_type": "cluster",
        "statistics": run_stats,
    }
391
+
392
+
393
def _run_liana_spatial_analysis(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Run LIANA+ spatial bivariate analysis

    Runs ``li.mt.bivariate`` with the configured local/global metrics,
    applies Benjamini-Hochberg FDR correction to the permutation p-values
    of the global metric, and writes scores, p-values, categories, and the
    standardized top-pair listing back onto ``adata``.

    Args:
        adata: AnnData object to analyze (modified in place).
        params: Cell communication analysis parameters.
        ctx: ToolContext (currently unused in this function body).

    Returns:
        Summary dict with pair counts, top pairs, result keys, and statistics.
    """
    import liana as li

    # Get appropriate resource name based on species
    resource_name = _get_liana_resource_name(params.species, params.liana_resource)

    # Use parameters from user (respect user choice)
    n_perms = params.liana_n_perms
    nz_prop = params.liana_nz_prop

    # Run LIANA+ bivariate analysis; returns an AnnData-like object with one
    # variable per ligand-receptor interaction and per-spot local scores in X
    lrdata = li.mt.bivariate(
        adata,
        resource_name=resource_name,
        local_name=params.liana_local_metric,
        global_name=params.liana_global_metric,
        n_perms=n_perms,
        mask_negatives=False,
        add_categories=True,
        nz_prop=nz_prop,
        use_raw=False,
        verbose=False,
    )

    # Get results summary
    n_lr_pairs = lrdata.n_vars

    # Get top pairs based on global metric (higher value = stronger signal)
    global_metric = params.liana_global_metric
    top_pairs_df = lrdata.var.nlargest(params.plot_top_pairs, global_metric)
    top_lr_pairs = top_pairs_df.index.tolist()

    # Count significant pairs using statistical significance (p-values with FDR correction)
    #
    # P-values are ALWAYS available because:
    # 1. We always pass global_name (params.liana_global_metric, default: "morans")
    # 2. We always pass n_perms > 0 (params.liana_n_perms, default: 1000, Field(gt=0))
    # 3. LIANA computes p-values via permutation test when n_perms > 0
    #    (see liana/method/sp/_bivariate/_global_functions.py lines 104-128)
    from statsmodels.stats.multitest import multipletests

    pvals_col = f"{global_metric}_pvals"
    alpha = params.liana_significance_alpha
    pvals = lrdata.var[pvals_col]

    reject, pvals_corrected, _, _ = multipletests(
        pvals, alpha=alpha, method="fdr_bh"  # Benjamini-Hochberg FDR correction
    )

    n_significant_pairs = reject.sum()

    # Store corrected p-values and significance flags for downstream use
    lrdata.var[f"{pvals_col}_corrected"] = pvals_corrected
    lrdata.var[f"{global_metric}_significant"] = reject

    # Store results in adata (var table, dense per-spot scores, pair names)
    adata.uns["liana_spatial_res"] = lrdata.var
    adata.obsm["liana_spatial_scores"] = to_dense(lrdata.X)
    adata.uns["liana_spatial_interactions"] = lrdata.var.index.tolist()

    # Optional per-spot layers produced by the bivariate run
    if "pvals" in lrdata.layers:
        adata.obsm["liana_spatial_pvals"] = to_dense(lrdata.layers["pvals"])

    if "cats" in lrdata.layers:
        adata.obsm["liana_spatial_cats"] = to_dense(lrdata.layers["cats"])

    # Store standardized L-R pairs for visualization.
    # Interaction names use either '^' or '_' as the ligand/receptor
    # separator; names with more than one '_' are skipped as ambiguous.
    detected_lr_pairs = []
    for pair_str in top_lr_pairs:
        if "^" in pair_str:
            ligand, receptor = pair_str.split("^", 1)
            detected_lr_pairs.append((ligand, receptor))
        elif "_" in pair_str:
            parts = pair_str.split("_")
            if len(parts) == 2:
                detected_lr_pairs.append((parts[0], parts[1]))

    # Store in standardized format for visualization
    adata.uns["detected_lr_pairs"] = detected_lr_pairs
    adata.uns["cell_communication_results"] = {
        "top_lr_pairs": top_lr_pairs,
        "method": "liana_spatial",
        "n_pairs": len(top_lr_pairs),
        "species": params.species,
    }

    statistics = {
        "method": "liana_spatial",
        "local_metric": params.liana_local_metric,
        "global_metric": params.liana_global_metric,
        "n_lr_pairs_tested": n_lr_pairs,
        "n_permutations": n_perms,
        "nz_proportion": nz_prop,
        "resource": params.liana_resource,
        # NOTE(review): pvals_col is always present at this point (the
        # lrdata.var[pvals_col] access above would have raised otherwise),
        # so these conditionals always take the first branch.
        "significance_method": (
            "FDR-corrected p-values"
            if pvals_col in lrdata.var.columns
            else "threshold-based (deprecated)"
        ),
        "fdr_method": "Benjamini-Hochberg" if pvals_col in lrdata.var.columns else None,
        "alpha": alpha if pvals_col in lrdata.var.columns else None,
    }

    return {
        "n_lr_pairs": n_lr_pairs,
        "n_significant_pairs": n_significant_pairs,
        "top_lr_pairs": top_lr_pairs,
        "liana_spatial_results_key": "liana_spatial_res",
        "liana_spatial_scores_key": "liana_spatial_scores",
        "analysis_type": "spatial",
        "statistics": statistics,
    }
508
+
509
+
510
def _ensure_cellphonedb_database(output_dir: str, ctx: "ToolContext") -> str:
    """Return the path of a CellPhoneDB database archive, downloading if absent.

    Looks for ``cellphonedb.zip`` inside *output_dir*; when it is missing,
    downloads the v5.0.0 database into that directory.

    Raises:
        DependencyError: If the download fails for any reason.
    """
    # Centralized dependency check; raises with install instructions if missing.
    require("cellphonedb")
    import os

    from cellphonedb.utils import db_utils

    archive_path = os.path.join(output_dir, "cellphonedb.zip")

    # Reuse a previously downloaded archive when present.
    if os.path.exists(archive_path):
        return archive_path

    try:
        db_utils.download_database(output_dir, "v5.0.0")
    except Exception as exc:
        raise DependencyError(
            f"Failed to download CellPhoneDB database: {exc}\n\n"
            "Troubleshooting:\n"
            "1. Check internet connection\n"
            "2. Verify CellPhoneDB version compatibility\n"
            "3. Try manually downloading database:\n"
            " from cellphonedb.utils import db_utils\n"
            " db_utils.download_database('/path/to/dir', 'v5.0.0')"
        ) from exc

    return archive_path
541
+
542
+
543
+ async def _analyze_communication_cellphonedb(
544
+ adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
545
+ ) -> dict[str, Any]:
546
+ """Analyze cell communication using CellPhoneDB"""
547
+ # Use centralized dependency manager for consistent error handling
548
+ require("cellphonedb") # Raises ImportError with install instructions if missing
549
+ import os
550
+ import tempfile
551
+
552
+ from cellphonedb.src.core.methods import cpdb_statistical_analysis_method
553
+
554
+ try:
555
+ import time
556
+
557
+ start_time = time.time()
558
+
559
+ # Use cell_type_key from params (required field, no auto-detect)
560
+ cell_type_col = params.cell_type_key
561
+
562
+ validate_obs_column(adata, cell_type_col, "Cell type")
563
+
564
+ # Use original adata directly (no gene filtering needed)
565
+ adata_for_analysis = adata
566
+
567
+ # Import pandas for DataFrame operations
568
+ import csv
569
+
570
+ import pandas as pd
571
+ import scipy.sparse as sp
572
+
573
+ # Check if data is sparse (used for efficient matrix access)
574
+ is_sparse = sp.issparse(adata_for_analysis.X)
575
+
576
+ # Prepare meta data
577
+ meta_df = pd.DataFrame(
578
+ {
579
+ "Cell": adata_for_analysis.obs.index,
580
+ "cell_type": adata_for_analysis.obs[cell_type_col].astype(str),
581
+ }
582
+ )
583
+
584
+ # Create microenvironments file if spatial data is available and requested
585
+ microenvs_file = None
586
+ if (
587
+ params.cellphonedb_use_microenvironments
588
+ and "spatial" in adata_for_analysis.obsm
589
+ ):
590
+ microenvs_file = await _create_microenvironments_file(
591
+ adata_for_analysis, params, ctx
592
+ )
593
+
594
+ # Set random seed for reproducibility
595
+ debug_seed = (
596
+ params.cellphonedb_debug_seed
597
+ if params.cellphonedb_debug_seed is not None
598
+ else 42
599
+ )
600
+ np.random.seed(debug_seed)
601
+
602
+ # Run CellPhoneDB statistical analysis
603
+ with tempfile.TemporaryDirectory() as temp_dir:
604
+ # Save data to temporary files
605
+ counts_file = os.path.join(temp_dir, "counts.txt")
606
+ meta_file = os.path.join(temp_dir, "meta.txt")
607
+
608
+ # Direct file writing: Stream sparse matrix to CSV without creating DataFrame
609
+ # Memory-efficient approach: write gene-by-gene instead of toarray()
610
+ with open(counts_file, "w", newline="") as f:
611
+ writer = csv.writer(f, delimiter="\t")
612
+
613
+ # Write header: empty first column + cell names
614
+ header = [""] + list(adata_for_analysis.obs_names)
615
+ writer.writerow(header)
616
+
617
+ # Convert to CSC for efficient column access (genes)
618
+ if is_sparse:
619
+ X_csc = adata_for_analysis.X.tocsc()
620
+ else:
621
+ X_csc = adata_for_analysis.X
622
+
623
+ # Write gene-by-gene (memory constant)
624
+ for i, gene_name in enumerate(adata_for_analysis.var_names):
625
+ gene_expression = to_dense(X_csc[:, i]).flatten()
626
+ writer.writerow([gene_name] + list(gene_expression))
627
+
628
+ meta_df.to_csv(meta_file, sep="\t", index=False)
629
+
630
+ try:
631
+ db_path = _ensure_cellphonedb_database(temp_dir, ctx)
632
+ except Exception as db_error:
633
+ raise DependencyError(
634
+ f"CellPhoneDB database setup failed: {db_error}"
635
+ ) from db_error
636
+
637
+ # Run the analysis using CellPhoneDB v5 API with correct parameters
638
+ try:
639
+ # STRICT: CellPhoneDB v5 ONLY - no backward compatibility for older versions
640
+ result = cpdb_statistical_analysis_method.call(
641
+ cpdb_file_path=db_path, # Fixed: Use actual database path
642
+ meta_file_path=meta_file,
643
+ counts_file_path=counts_file,
644
+ counts_data="hgnc_symbol", # Improved: Use recommended gene identifier
645
+ threshold=params.cellphonedb_threshold,
646
+ result_precision=params.cellphonedb_result_precision,
647
+ pvalue=params.cellphonedb_pvalue,
648
+ iterations=params.cellphonedb_iterations,
649
+ debug_seed=debug_seed,
650
+ output_path=temp_dir,
651
+ microenvs_file_path=microenvs_file,
652
+ score_interactions=False, # Disabled: CellPhoneDB v5 scoring has bugs
653
+ )
654
+ except KeyError as key_error:
655
+ raise ProcessingError(
656
+ f"CellPhoneDB found no L-R interactions. "
657
+ f"CellPhoneDB is human-only; use method='liana' for mouse data. "
658
+ f"Error: {key_error}"
659
+ ) from key_error
660
+ except Exception as api_error:
661
+ raise ProcessingError(
662
+ f"CellPhoneDB analysis failed: {str(api_error)}. "
663
+ f"Consider using method='liana' as alternative."
664
+ ) from api_error
665
+
666
+ # Validate CellPhoneDB v5 format
667
+ if not isinstance(result, dict):
668
+ raise ProcessingError(
669
+ f"CellPhoneDB returned unexpected format: {type(result).__name__}. "
670
+ f"Expected dict from CellPhoneDB v5. Check installation: pip install 'cellphonedb>=5.0.0'"
671
+ )
672
+
673
+ # Check for empty results (no interactions found)
674
+ if not result or "significant_means" not in result:
675
+ raise DataNotFoundError(
676
+ "CellPhoneDB found no L-R interactions. "
677
+ "CellPhoneDB is human-only; use method='liana' for mouse data."
678
+ )
679
+
680
+ # Extract results from CellPhoneDB v5 dictionary format
681
+ deconvoluted = result.get("deconvoluted")
682
+ means = result.get("means")
683
+ pvalues = result.get("pvalues")
684
+ significant_means = result.get("significant_means")
685
+
686
+ # Store results in AnnData object
687
+ adata.uns["cellphonedb_deconvoluted"] = deconvoluted
688
+ adata.uns["cellphonedb_means"] = means
689
+ adata.uns["cellphonedb_pvalues"] = pvalues
690
+ adata.uns["cellphonedb_significant_means"] = significant_means
691
+
692
+ # Calculate statistics
693
+ n_lr_pairs = (
694
+ len(means) if means is not None and hasattr(means, "__len__") else 0
695
+ )
696
+
697
+ # Filter significant pairs based on p-values
698
+ # CellPhoneDB v5 returns all pairs in 'significant_means', so manual filtering is needed
699
+ if (
700
+ pvalues is None
701
+ or not hasattr(pvalues, "values")
702
+ or means is None
703
+ or not hasattr(means, "index")
704
+ ):
705
+ raise DataNotFoundError(
706
+ "CellPhoneDB p-values unavailable - cannot identify significant interactions. "
707
+ "Try method='liana' as alternative."
708
+ )
709
+
710
+ # Filter pairs where ANY cell-cell interaction has p < threshold
711
+ # WITH multiple testing correction for cell type pairs
712
+ threshold = params.cellphonedb_pvalue
713
+ correction_method = params.cellphonedb_correction_method
714
+
715
+ # Use nanmin to find minimum p-value across all cell type pairs
716
+ # A pair is significant if its minimum p-value < threshold (after correction)
717
+ # Convert to numeric to handle any non-numeric values
718
+ pval_array = pvalues.select_dtypes(include=[np.number]).values
719
+ if pval_array.shape[0] == 0:
720
+ raise ProcessingError("CellPhoneDB p-values are not numeric.")
721
+
722
+ # Apply multiple testing correction if requested
723
+ # Correct p-values for each L-R pair across its cell type pairs to control FPR
724
+ n_cell_type_pairs = pval_array.shape[1]
725
+ n_lr_pairs_total = pval_array.shape[0]
726
+
727
+ if correction_method == "none":
728
+ # No correction: use minimum p-value (not recommended)
729
+ min_pvals = np.nanmin(pval_array, axis=1)
730
+ mask = min_pvals < threshold
731
+
732
+ await ctx.warning(
733
+ f"Multiple testing correction disabled. With {n_cell_type_pairs} cell type pairs, consider using 'fdr_bh' or 'bonferroni'."
734
+ )
735
+
736
+ # For 'none', we don't have corrected p-values per se, just use min
737
+ min_pvals_corrected = min_pvals.copy()
738
+
739
+ else:
740
+ # CORRECT APPROACH: For each L-R pair, correct its cell type pair p-values
741
+ # Then check if ANY cell type pair remains significant after correction
742
+ from statsmodels.stats.multitest import multipletests
743
+
744
+ mask = np.zeros(n_lr_pairs_total, dtype=bool)
745
+ min_pvals_corrected = np.ones(
746
+ n_lr_pairs_total
747
+ ) # Store minimum corrected p-value
748
+
749
+ n_uncorrected_sig = 0
750
+ n_corrected_sig = 0
751
+
752
+ for i in range(n_lr_pairs_total):
753
+ # Get p-values for this L-R pair across all cell type pairs
754
+ pvals_this_lr = pval_array[i, :]
755
+
756
+ # Count uncorrected significance
757
+ n_uncorrected_sig += (pvals_this_lr < threshold).any()
758
+
759
+ # Apply correction across cell type pairs for this L-R pair
760
+ reject_this_lr, pvals_corrected_this_lr, _, _ = multipletests(
761
+ pvals_this_lr,
762
+ alpha=threshold,
763
+ method=correction_method,
764
+ is_sorted=False,
765
+ returnsorted=False,
766
+ )
767
+
768
+ # This L-R pair is significant if ANY cell type pair is significant after correction
769
+ if reject_this_lr.any():
770
+ mask[i] = True
771
+ n_corrected_sig += 1
772
+
773
+ # Store minimum corrected p-value for this L-R pair
774
+ min_pvals_corrected[i] = pvals_corrected_this_lr.min()
775
+
776
+ n_significant_pairs = int(np.sum(mask))
777
+
778
+ # Store minimum corrected p-values for transparency
779
+ # Convert Series to DataFrame for H5AD compatibility (H5AD cannot store pd.Series)
780
+ adata.uns["cellphonedb_pvalues_min_corrected"] = pd.DataFrame(
781
+ {f"min_corrected_pvalue_{correction_method}": min_pvals_corrected},
782
+ index=pvalues.index.astype(str),
783
+ )
784
+
785
+ # Update stored significant_means to match filtered results
786
+ if n_significant_pairs > 0:
787
+ significant_indices = means.index[mask]
788
+ significant_means_filtered = means.loc[significant_indices]
789
+
790
+ # Update stored significant_means
791
+ adata.uns["cellphonedb_significant_means"] = significant_means_filtered
792
+
793
+ # Also update the variable for downstream use
794
+ significant_means = significant_means_filtered
795
+ else:
796
+ # No significant interactions found
797
+ await ctx.warning(
798
+ f"No significant interactions found at p < {threshold}. Consider adjusting threshold or using method='liana'."
799
+ )
800
+
801
+ # Get top LR pairs
802
+ # CellPhoneDB returns interactions in 'interacting_pair' column
803
+ top_lr_pairs = []
804
+ if (
805
+ significant_means is not None
806
+ and hasattr(significant_means, "head")
807
+ and hasattr(significant_means, "columns")
808
+ and "interacting_pair" in significant_means.columns
809
+ ):
810
+ top_pairs_df = significant_means.head(params.plot_top_pairs)
811
+ top_lr_pairs = top_pairs_df["interacting_pair"].tolist()
812
+
813
+ end_time = time.time()
814
+ analysis_time = end_time - start_time
815
+
816
+ n_cell_types = meta_df["cell_type"].nunique()
817
+ n_cell_type_pairs = n_cell_types**2
818
+
819
+ # Add correction statistics (useful for understanding results)
820
+ # When correction_method != "none", n_uncorrected_sig and n_corrected_sig
821
+ # are always defined in the else branch above (lines 1008-1009)
822
+ correction_stats = {}
823
+ if correction_method != "none":
824
+ correction_stats["n_uncorrected_significant"] = int(n_uncorrected_sig)
825
+ correction_stats["n_corrected_significant"] = int(n_corrected_sig)
826
+ if n_uncorrected_sig > 0:
827
+ correction_stats["reduction_percentage"] = round(
828
+ (1 - n_corrected_sig / n_uncorrected_sig) * 100, 2
829
+ )
830
+
831
+ statistics = {
832
+ "method": "cellphonedb",
833
+ "iterations": params.cellphonedb_iterations,
834
+ "threshold": params.cellphonedb_threshold,
835
+ "pvalue_threshold": params.cellphonedb_pvalue,
836
+ "n_cell_types": n_cell_types,
837
+ "n_cell_type_pairs": n_cell_type_pairs,
838
+ "multiple_testing_correction": correction_method,
839
+ "microenvironments_used": microenvs_file is not None,
840
+ "analysis_time_seconds": analysis_time,
841
+ }
842
+
843
+ # Add correction stats if available
844
+ if correction_stats:
845
+ statistics["correction_statistics"] = correction_stats
846
+
847
+ return {
848
+ "n_lr_pairs": n_lr_pairs,
849
+ "n_significant_pairs": n_significant_pairs,
850
+ "top_lr_pairs": top_lr_pairs,
851
+ "cellphonedb_results_key": "cellphonedb_means",
852
+ "cellphonedb_pvalues_key": "cellphonedb_pvalues",
853
+ "cellphonedb_significant_key": "cellphonedb_significant_means",
854
+ "analysis_type": "statistical",
855
+ "statistics": statistics,
856
+ }
857
+
858
+ except Exception as e:
859
+ raise ProcessingError(f"CellPhoneDB analysis failed: {e}") from e
860
+ finally:
861
+ # Cleanup: Remove temporary microenvironments file if created
862
+ if microenvs_file is not None:
863
+ try:
864
+ os.remove(microenvs_file)
865
+ except OSError:
866
+ pass # Cleanup failure is not critical
867
+
868
+
869
async def _create_microenvironments_file(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> Optional[str]:
    """Build a CellPhoneDB microenvironments file from spatial neighborhoods.

    Each cell's neighborhood (within a radius) is summarized by the set of
    cell types it contains; every unique co-occurrence signature becomes one
    microenvironment. The mapping (cell_type -> microenvironment) is written
    to a temporary tab-separated file in the format CellPhoneDB expects.

    Returns:
        Path of the temporary microenvironments file, or None when no spatial
        coordinates are available or any step fails (failure is logged as a
        warning, never raised).
    """
    try:
        import tempfile

        from sklearn.neighbors import NearestNeighbors

        spatial_key = get_spatial_key(adata)
        if spatial_key is None:
            return None

        coords = adata.obsm[spatial_key]

        # Radius: explicit user value, otherwise a density heuristic —
        # twice the median distance to the 5th nearest neighbor.
        radius = params.cellphonedb_spatial_radius
        if radius is None:
            knn = NearestNeighbors(n_neighbors=6)
            knn.fit(coords)
            dists, _ = knn.kneighbors(coords)
            radius = np.median(dists[:, 5]) * 2  # 5th neighbor (0-indexed), doubled

        # Sparse adjacency of spatial neighbors within the chosen radius.
        rnn = NearestNeighbors(radius=radius)
        rnn.fit(coords)
        adjacency = rnn.radius_neighbors_graph(coords)

        validate_obs_column(adata, params.cell_type_key, "Cell type")
        labels = adata.obs[params.cell_type_key].values

        # signature_names: unique sorted cell-type tuple -> microenv name
        # type_to_envs:   cell type -> set of microenv names it appears in
        signature_names: dict = {}
        type_to_envs: dict = {}

        for idx in range(adata.n_obs):
            hood = adjacency[idx].indices
            # A neighborhood of size <= 1 is just the cell itself — skip.
            if len(hood) <= 1:
                continue

            hood_types = {labels[j] for j in hood}
            signature = tuple(sorted(hood_types))

            # New signature -> mint the next sequential microenv name.
            if signature not in signature_names:
                signature_names[signature] = f"microenv_{len(signature_names)}"
            env_name = signature_names[signature]

            # Register every participating cell type under this microenv.
            for ct in hood_types:
                type_to_envs.setdefault(ct, set()).add(env_name)

        # Flatten to (cell_type, microenvironment) rows for the output file.
        rows = [
            [ct, env] for ct, env_set in type_to_envs.items() for env in env_set
        ]

        # Persist in the exact header/column format CellPhoneDB requires.
        out = tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix="_microenvironments.txt"
        )
        out.write("cell_type\tmicroenvironment\n")
        for ct, env in rows:
            out.write(f"{ct}\t{env}\n")
        out.close()

        return out.name

    except Exception as e:
        await ctx.warning(f"Failed to create microenvironments file: {e}")
        return None
957
+
958
def _analyze_communication_cellchat_r(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using native R CellChat package

    This implementation uses rpy2 to call the original R CellChat package,
    which includes full features like mediator proteins and signaling pathways
    that are not available in the LIANA simplified implementation.

    Note: this function is synchronous (no ``await``); every ``ro.r(...)``
    call below blocks until the corresponding R step completes. The R global
    environment is used as a scratch space (``expr_matrix``, ``meta_df``,
    ``cellchat`` etc.), so concurrent invocations in one process would clash.

    Args:
        adata: AnnData object with expression data
        params: Cell communication analysis parameters
        ctx: ToolContext for logging and data access (not referenced in this
            function's body)

    Returns:
        Dictionary with analysis results

    Raises:
        ProcessingError: wrapping any failure in the R pipeline.
    """
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri, pandas2ri
    from rpy2.robjects.conversion import localconverter

    try:
        import time

        start_time = time.time()

        # Validate cell type column
        validate_obs_column(adata, params.cell_type_key, "Cell type")

        # Check for spatial data
        spatial_key = get_spatial_key(adata)
        has_spatial = spatial_key is not None

        # Prepare expression matrix (genes x cells, normalized)
        # CellChat requires normalized data with comprehensive gene coverage
        # Use adata.raw if available (contains all genes before HVG filtering)
        if adata.raw is not None:
            data_source = adata.raw
        else:
            data_source = adata

        # Run CellChat in R - start early to get gene list for pre-filtering
        with localconverter(
            ro.default_converter + pandas2ri.converter + numpy2ri.converter
        ):
            # Load CellChat
            ro.r("library(CellChat)")

            # Set species-specific database
            species_db_map = {
                "human": "CellChatDB.human",
                "mouse": "CellChatDB.mouse",
                "zebrafish": "CellChatDB.zebrafish",
            }
            # Unknown species silently falls back to the human database.
            db_name = species_db_map.get(params.species, "CellChatDB.human")

            # Memory optimization: Get CellChatDB gene list and pre-filter
            # This reduces memory from O(n_cells × n_all_genes) to O(n_cells × n_db_genes)
            # Typical savings: 20000 genes → 1500 genes = 13x memory reduction
            ro.r(
                f"""
                CellChatDB <- {db_name}
                # Get all genes used in CellChatDB (ligands, receptors, cofactors)
                cellchat_genes <- unique(c(
                    CellChatDB$geneInfo$Symbol,
                    unlist(strsplit(CellChatDB$interaction$ligand, "_")),
                    unlist(strsplit(CellChatDB$interaction$receptor, "_"))
                ))
                cellchat_genes <- cellchat_genes[!is.na(cellchat_genes)]
                """
            )
            cellchat_genes_r = ro.r("cellchat_genes")
            cellchat_genes = set(cellchat_genes_r)

            # Filter to genes present in both data and CellChatDB
            common_genes = data_source.var_names.intersection(cellchat_genes)

            if len(common_genes) == 0:
                raise ValueError(
                    f"No genes overlap between data and {db_name}. "
                    f"Check if species parameter matches your data."
                )

            # Create expression matrix with only CellChatDB genes (memory efficient)
            # Transposed to genes x cells, as CellChat expects.
            gene_indices = [data_source.var_names.get_loc(g) for g in common_genes]
            expr_matrix = pd.DataFrame(
                to_dense(data_source.X[:, gene_indices]).T,
                index=common_genes,
                columns=adata.obs_names,
            )

            # Prepare metadata
            # CellChat doesn't allow labels starting with '0', so add prefix for numeric
            cell_labels = adata.obs[params.cell_type_key].astype(str).values
            # Check if any label is '0' or starts with a digit - add 'cluster_' prefix
            # NOTE: the prefix is applied to ALL labels if ANY label is numeric,
            # keeping the label set internally consistent.
            if any(
                label == "0" or (label and label[0].isdigit()) for label in cell_labels
            ):
                cell_labels = [f"cluster_{label}" for label in cell_labels]
            meta_df = pd.DataFrame(
                {"labels": cell_labels},
                index=adata.obs_names,
            )

            # Prepare spatial coordinates if available
            # Only the first two coordinate columns are used (x, y).
            spatial_locs = None
            if has_spatial and params.cellchat_distance_use:
                spatial_coords = adata.obsm[spatial_key]
                spatial_locs = pd.DataFrame(
                    spatial_coords[:, :2],
                    index=adata.obs_names,
                    columns=["x", "y"],
                )

            # Transfer data to R
            ro.globalenv["expr_matrix"] = expr_matrix
            ro.globalenv["meta_df"] = meta_df

            # Create CellChat object (db_name already set during gene pre-filtering)
            if (
                has_spatial
                and params.cellchat_distance_use
                and spatial_locs is not None
            ):
                # Spatial mode
                ro.globalenv["spatial_locs"] = spatial_locs

                # CellChat v2 requires spatial.factors with 'ratio' and 'tol':
                # - ratio: conversion factor from pixels to micrometers (um)
                # - tol: tolerance factor (half of spot/cell size in um)
                # Use user-configurable parameters for platform flexibility
                pixel_ratio = params.cellchat_pixel_ratio
                spatial_tol = params.cellchat_spatial_tol
                ro.globalenv["pixel_ratio"] = pixel_ratio
                ro.globalenv["spatial_tol"] = spatial_tol
                ro.r(
                    """
                    spatial.factors <- data.frame(
                        ratio = pixel_ratio,
                        tol = spatial_tol
                    )

                    cellchat <- createCellChat(
                        object = as.matrix(expr_matrix),
                        meta = meta_df,
                        group.by = "labels",
                        datatype = "spatial",
                        coordinates = as.matrix(spatial_locs),
                        spatial.factors = spatial.factors
                    )
                    """
                )
            else:
                # Non-spatial mode
                ro.r(
                    """
                    cellchat <- createCellChat(
                        object = as.matrix(expr_matrix),
                        meta = meta_df,
                        group.by = "labels"
                    )
                    """
                )

            # Set database
            ro.r(
                f"""
                CellChatDB <- {db_name}
                """
            )

            # Subset database by category if specified
            if params.cellchat_db_category != "All":
                ro.r(
                    f"""
                    CellChatDB.use <- subsetDB(
                        CellChatDB,
                        search = "{params.cellchat_db_category}"
                    )
                    cellchat@DB <- CellChatDB.use
                    """
                )
            else:
                ro.r(
                    """
                    cellchat@DB <- CellChatDB
                    """
                )

            # Preprocessing
            ro.r(
                """
                cellchat <- subsetData(cellchat)
                cellchat <- identifyOverExpressedGenes(cellchat)
                cellchat <- identifyOverExpressedInteractions(cellchat)
                """
            )

            # Project data (optional but recommended)
            # NOTE(review): PPI.human is used here regardless of params.species;
            # for mouse/zebrafish data the projection either fails (caught by
            # tryCatch below, pipeline continues unprojected) or uses the wrong
            # PPI network — confirm intended behavior for non-human species.
            ro.r(
                """
                # Project data onto PPI network (optional)
                tryCatch({
                    cellchat <- projectData(cellchat, PPI.human)
                }, error = function(e) {
                    message("Skipping data projection: ", e$message)
                })
                """
            )

            # Compute communication probability
            if has_spatial and params.cellchat_distance_use:
                # Spatial mode with distance constraints
                # CellChat v2 requires either contact.range or contact.knn.k
                if params.cellchat_contact_range is not None:
                    contact_param = f"contact.range = {params.cellchat_contact_range}"
                else:
                    contact_param = f"contact.knn.k = {params.cellchat_contact_knn_k}"

                ro.r(
                    f"""
                    cellchat <- computeCommunProb(
                        cellchat,
                        type = "{params.cellchat_type}",
                        trim = {params.cellchat_trim},
                        population.size = {str(params.cellchat_population_size).upper()},
                        distance.use = TRUE,
                        interaction.range = {params.cellchat_interaction_range},
                        scale.distance = {params.cellchat_scale_distance},
                        {contact_param}
                    )
                    """
                )
            else:
                # Non-spatial mode
                ro.r(
                    f"""
                    cellchat <- computeCommunProb(
                        cellchat,
                        type = "{params.cellchat_type}",
                        trim = {params.cellchat_trim},
                        population.size = {str(params.cellchat_population_size).upper()}
                    )
                    """
                )

            # Filter communication
            ro.r(
                f"""
                cellchat <- filterCommunication(cellchat, min.cells = {params.cellchat_min_cells})
                """
            )

            # Compute pathway-level communication
            ro.r(
                """
                cellchat <- computeCommunProbPathway(cellchat)
                """
            )

            # Aggregate network
            ro.r(
                """
                cellchat <- aggregateNet(cellchat)
                """
            )

            # Extract results
            ro.r(
                """
                # Get LR pairs
                lr_pairs <- cellchat@LR$LRsig

                # Get communication probabilities
                net <- cellchat@net

                # Get pathway-level probabilities
                netP <- cellchat@netP

                # Count interactions
                n_lr_pairs <- length(unique(lr_pairs$interaction_name))

                # Get significant pairs (probability > 0)
                prob_matrix <- net$prob
                n_significant <- sum(prob_matrix > 0, na.rm = TRUE)

                # Get top pathways
                pathway_names <- rownames(netP$prob)
                if (length(pathway_names) > 0) {
                    # Sum probabilities across cell type pairs for each pathway
                    pathway_sums <- rowSums(netP$prob, na.rm = TRUE)
                    top_pathway_idx <- order(pathway_sums, decreasing = TRUE)[1:min(10, length(pathway_names))]
                    top_pathways <- pathway_names[top_pathway_idx]
                } else {
                    top_pathways <- character(0)
                }

                # Get top LR pairs
                if (nrow(lr_pairs) > 0) {
                    top_lr <- head(lr_pairs$interaction_name, 10)
                } else {
                    top_lr <- character(0)
                }
                """
            )

            # Convert results back to Python
            n_lr_pairs = int(ro.r("n_lr_pairs")[0])
            n_significant_pairs = int(ro.r("n_significant")[0])
            top_pathways = list(ro.r("top_pathways"))
            top_lr_pairs = list(ro.r("top_lr"))

            # Get full results for storage
            lr_pairs_df = ro.r("as.data.frame(lr_pairs)")
            prob_matrix = ro.r("as.matrix(net$prob)")
            pval_matrix = ro.r("as.matrix(net$pval)")

            # Store in adata
            adata.uns["cellchat_r_lr_pairs"] = pd.DataFrame(lr_pairs_df)
            adata.uns["cellchat_r_prob"] = np.array(prob_matrix)
            adata.uns["cellchat_r_pval"] = np.array(pval_matrix)
            adata.uns["cellchat_r_top_pathways"] = top_pathways
            adata.uns["cellchat_r_params"] = {
                "species": params.species,
                "db_category": params.cellchat_db_category,
                "type": params.cellchat_type,
                "distance_use": params.cellchat_distance_use if has_spatial else False,
            }

            # Store detected LR pairs in standardized format for visualization
            # CellChat interaction names look like "LIGAND_RECEPTOR"; split on
            # the FIRST underscore only (receptor subunits may contain more).
            detected_lr_pairs = []
            for pair_str in top_lr_pairs:
                if "_" in pair_str:
                    parts = pair_str.split("_", 1)
                    if len(parts) == 2:
                        detected_lr_pairs.append((parts[0], parts[1]))

            adata.uns["detected_lr_pairs"] = detected_lr_pairs
            adata.uns["cell_communication_results"] = {
                "top_lr_pairs": top_lr_pairs,
                "top_pathways": top_pathways,
                "method": "cellchat_r",
                "n_pairs": len(top_lr_pairs),
                "species": params.species,
            }

        end_time = time.time()
        analysis_time = end_time - start_time

        statistics = {
            "method": "cellchat_r",
            "species": params.species,
            "db_category": params.cellchat_db_category,
            "aggregation_type": params.cellchat_type,
            "trim": params.cellchat_trim,
            "population_size": params.cellchat_population_size,
            "min_cells": params.cellchat_min_cells,
            "spatial_mode": has_spatial and params.cellchat_distance_use,
            "n_lr_pairs_tested": n_lr_pairs,
            "analysis_time_seconds": analysis_time,
            "top_pathways": top_pathways[:5] if top_pathways else [],
        }

        return {
            "n_lr_pairs": n_lr_pairs,
            "n_significant_pairs": n_significant_pairs,
            "top_lr_pairs": top_lr_pairs,
            "cellchat_r_results_key": "cellchat_r_lr_pairs",
            "cellchat_r_prob_key": "cellchat_r_prob",
            "cellchat_r_pval_key": "cellchat_r_pval",
            "analysis_type": "cellchat_native",
            "statistics": statistics,
        }

    except Exception as e:
        raise ProcessingError(f"CellChat R analysis failed: {e}") from e
1335
+
1336
+
1337
async def _analyze_communication_fastccc(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using FastCCC permutation-free framework.

    FastCCC uses FFT-based convolution to compute p-values analytically,
    making it extremely fast for large datasets (16M cells in minutes).

    Reference: Nature Communications 2025 (https://github.com/Svvord/FastCCC)

    FastCCC communicates via the filesystem: the expression matrix is written
    to a temporary h5ad file, and (in Cauchy mode) results are read back from
    TSV files FastCCC writes to the output directory.

    Args:
        adata: AnnData object with expression data
        params: Cell communication analysis parameters
        ctx: ToolContext for logging and data access

    Returns:
        Dictionary with analysis results

    Raises:
        ProcessingError: wrapping any failure, including a missing bundled
            CellPhoneDB database or absent FastCCC output files.
    """
    import glob
    import os
    import tempfile
    import time

    import pandas as pd

    from ..utils.adata_utils import to_dense

    try:
        start_time = time.time()

        # Import FastCCC
        # Deferred import so the module loads even without fastccc installed;
        # only the entry point matching the chosen mode is imported.
        if params.fastccc_use_cauchy:
            from fastccc import Cauchy_combination_of_statistical_analysis_methods
        else:
            from fastccc import statistical_analysis_method

        # Validate cell type column
        validate_obs_column(adata, params.cell_type_key, "Cell type")

        # Use adata.raw if available for comprehensive gene coverage
        if adata.raw is not None:
            data_source = adata.raw
            await ctx.info("Using adata.raw for comprehensive gene coverage")
        else:
            data_source = adata

        # Create temporary directory for FastCCC I/O
        # Everything below (run + result processing) happens inside this
        # context so the temp files are removed even on error.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save expression data as h5ad for FastCCC
            counts_file = os.path.join(temp_dir, "counts.h5ad")

            # Create a minimal AnnData for saving (FastCCC reads h5ad directly)
            # IMPORTANT: FastCCC requires normalized log1p-transformed data
            # with max value < 14 (default threshold)
            import anndata as ad
            import scanpy as sc

            # Prepare expression matrix (cells × genes)
            # Densified here; memory cost is O(n_cells × n_genes).
            expr_matrix = to_dense(data_source.X)
            gene_names = list(data_source.var_names)
            cell_names = list(adata.obs_names)

            # Create temporary AnnData
            temp_adata = ad.AnnData(
                X=expr_matrix.copy(),
                obs=pd.DataFrame(index=cell_names),
                var=pd.DataFrame(index=gene_names),
            )

            # Check if data needs normalization (FastCCC max threshold is 14)
            # Heuristic: log1p-transformed data rarely exceeds 14, so a larger
            # max value implies raw/linear counts that need normalizing.
            max_val = np.max(temp_adata.X)
            if max_val > 14:
                await ctx.info(
                    f"Data max value ({max_val:.1f}) exceeds FastCCC threshold (14). "
                    f"Applying normalize_total + log1p transformation..."
                )
                # Apply standard scanpy normalization pipeline
                sc.pp.normalize_total(temp_adata, target_sum=1e4)
                sc.pp.log1p(temp_adata)
                new_max = np.max(temp_adata.X)
                await ctx.info(f"After normalization: max value = {new_max:.2f}")

            # Make var names unique (FastCCC requirement)
            temp_adata.var_names_make_unique()

            # Add cell type labels to obs
            temp_adata.obs[params.cell_type_key] = adata.obs[
                params.cell_type_key
            ].values

            # Save to h5ad
            temp_adata.write_h5ad(counts_file)

            # Get database directory path (FastCCC uses CellPhoneDB database format)
            # FastCCC expects a directory containing interaction_table.csv and other files
            # Check for bundled database in chatspatial package
            chatspatial_pkg_dir = os.path.dirname(os.path.dirname(__file__))
            database_dir = os.path.join(
                chatspatial_pkg_dir,
                "data",
                "cellphonedb_v5",
                "cellphonedb-data-5.0.0",
            )

            # Verify required files exist
            required_file = os.path.join(database_dir, "interaction_table.csv")
            if not os.path.exists(required_file):
                raise ProcessingError(
                    f"FastCCC requires CellPhoneDB database files. "
                    f"Expected directory: {database_dir} with interaction_table.csv. "
                    f"Please download from: https://github.com/ventolab/cellphonedb-data"
                )

            # Output directory for results
            output_dir = os.path.join(temp_dir, "results")
            os.makedirs(output_dir, exist_ok=True)

            # Run FastCCC analysis
            if params.fastccc_use_cauchy:
                # Cauchy combination method (more robust, multiple parameter combinations)
                # Note: This function saves results to files and returns None
                Cauchy_combination_of_statistical_analysis_methods(
                    database_file_path=database_dir,
                    celltype_file_path=None,  # Using meta_key instead
                    counts_file_path=counts_file,
                    convert_type="hgnc_symbol",
                    single_unit_summary_list=[
                        "Mean",
                        "Median",
                        "Q3",
                        "Quantile_0.9",
                    ],
                    complex_aggregation_list=["Minimum", "Average"],
                    LR_combination_list=["Arithmetic", "Geometric"],
                    min_percentile=params.fastccc_min_percentile,
                    save_path=output_dir,
                    meta_key=params.cell_type_key,
                    use_DEG=params.fastccc_use_deg,
                )

                # Read results from saved files (Cauchy method saves to files)
                # Find the task ID from output files
                pval_files = glob.glob(os.path.join(output_dir, "*_Cauchy_pvals.tsv"))
                if not pval_files:
                    raise ProcessingError(
                        "FastCCC Cauchy combination did not produce output files"
                    )

                # Extract task_id from filename
                # If multiple runs wrote here, only the first match is used.
                pval_file = pval_files[0]
                task_id = os.path.basename(pval_file).replace("_Cauchy_pvals.tsv", "")

                # Read combined results
                pvalues = pd.read_csv(pval_file, index_col=0, sep="\t")
                strength_file = os.path.join(
                    output_dir, f"{task_id}_average_interactions_strength.tsv"
                )
                interactions_strength = pd.read_csv(
                    strength_file, index_col=0, sep="\t"
                )

                # Percentages are in individual method files, use first one
                pct_files = glob.glob(
                    os.path.join(output_dir, f"{task_id}*percents_analysis.tsv")
                )
                if pct_files:
                    percentages = pd.read_csv(pct_files[0], index_col=0, sep="\t")
                else:
                    percentages = None

            else:
                # Single method (faster)
                interactions_strength, pvalues, percentages = (
                    statistical_analysis_method(
                        database_file_path=database_dir,
                        celltype_file_path=None,  # Using meta_key instead
                        counts_file_path=counts_file,
                        convert_type="hgnc_symbol",
                        single_unit_summary=params.fastccc_single_unit_summary,
                        complex_aggregation=params.fastccc_complex_aggregation,
                        LR_combination=params.fastccc_lr_combination,
                        min_percentile=params.fastccc_min_percentile,
                        save_path=output_dir,
                        meta_key=params.cell_type_key,
                        use_DEG=params.fastccc_use_deg,
                    )
                )

            # Process results
            n_lr_pairs = len(pvalues) if pvalues is not None else 0

            # Count significant pairs
            # An L-R pair counts as significant if ANY cell-type pair's
            # p-value clears the threshold (no multiple-testing correction
            # here — NOTE(review): the CellPhoneDB path corrects; confirm
            # whether FastCCC results should too).
            threshold = params.fastccc_pvalue_threshold
            if pvalues is not None and hasattr(pvalues, "values"):
                # Get minimum p-value across all cell type pairs for each LR pair
                pval_array = pvalues.select_dtypes(include=[np.number]).values
                min_pvals = np.nanmin(pval_array, axis=1)
                n_significant_pairs = int(np.sum(min_pvals < threshold))
            else:
                n_significant_pairs = 0

            # Get top LR pairs based on interaction strength
            top_lr_pairs = []
            detected_lr_pairs = []
            if interactions_strength is not None and hasattr(
                interactions_strength, "index"
            ):
                # Sort by mean interaction strength across cell type pairs
                if hasattr(interactions_strength, "select_dtypes"):
                    strength_array = interactions_strength.select_dtypes(
                        include=[np.number]
                    ).values
                    mean_strength = np.nanmean(strength_array, axis=1)
                    top_indices = np.argsort(mean_strength)[::-1][: params.plot_top_pairs]
                    top_lr_pairs = [interactions_strength.index[i] for i in top_indices]

                # Parse LR pair names
                # Index entries look like "LIGAND_RECEPTOR"; split on the
                # FIRST underscore only (complex names may contain more).
                for pair_str in top_lr_pairs:
                    if "_" in pair_str:
                        parts = pair_str.split("_", 1)
                        if len(parts) == 2:
                            detected_lr_pairs.append((parts[0], parts[1]))

            # Store results in adata
            adata.uns["fastccc_interactions_strength"] = interactions_strength
            adata.uns["fastccc_pvalues"] = pvalues
            adata.uns["fastccc_percentages"] = percentages

            # Store standardized format for visualization
            adata.uns["detected_lr_pairs"] = detected_lr_pairs
            adata.uns["cell_communication_results"] = {
                "top_lr_pairs": top_lr_pairs,
                "method": "fastccc",
                "n_pairs": len(top_lr_pairs),
                "species": params.species,
            }

            end_time = time.time()
            analysis_time = end_time - start_time

            statistics = {
                "method": "fastccc",
                "species": params.species,
                "use_cauchy": params.fastccc_use_cauchy,
                "single_unit_summary": params.fastccc_single_unit_summary,
                "complex_aggregation": params.fastccc_complex_aggregation,
                "lr_combination": params.fastccc_lr_combination,
                "min_percentile": params.fastccc_min_percentile,
                "pvalue_threshold": threshold,
                "use_deg": params.fastccc_use_deg,
                "n_lr_pairs_tested": n_lr_pairs,
                "analysis_time_seconds": analysis_time,
                "permutation_free": True,  # Key FastCCC feature
            }

            return {
                "n_lr_pairs": n_lr_pairs,
                "n_significant_pairs": n_significant_pairs,
                "top_lr_pairs": top_lr_pairs,
                "fastccc_results_key": "fastccc_interactions_strength",
                "fastccc_pvalues_key": "fastccc_pvalues",
                "analysis_type": "fastccc_permutation_free",
                "statistics": statistics,
            }

    except Exception as e:
        raise ProcessingError(f"FastCCC analysis failed: {e}") from e