chatspatial-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1603 @@
"""
Cell-cell communication analysis tools for spatial transcriptomics data.
"""

from typing import TYPE_CHECKING, Any, Optional

import numpy as np

if TYPE_CHECKING:
    from ..spatial_mcp_adapter import ToolContext

from ..models.analysis import CellCommunicationResult
from ..models.data import CellCommunicationParameters
from ..utils import validate_obs_column
from ..utils.adata_utils import get_spatial_key, to_dense
from ..utils.dependency_manager import require, validate_r_package
from ..utils.exceptions import (
    DataNotFoundError,
    DependencyError,
    ParameterError,
    ProcessingError,
)


async def _validate_liana_requirements(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> None:
    """Validate LIANA+ requirements"""
    # Spatial connectivity validation
    if params.perform_spatial_analysis and "spatial_connectivities" not in adata.obsp:
        raise DataNotFoundError(
            "Spatial connectivity required for LIANA+ bivariate analysis.\n\n"
            "Run spatial neighbor computation first:\n"
            "  import squidpy as sq\n"
            "  sq.gr.spatial_neighbors(adata, coord_type='grid', n_rings=1)\n\n"
            "Platform-specific recommendations:\n"
            "  Visium: coord_type='grid', n_rings=1-2\n"
            "  MERFISH: coord_type='generic', radius=20-50\n"
            "  Slide-seq: coord_type='generic', n_neighs=10-30"
        )

    # Cell type validation
    validate_obs_column(adata, params.cell_type_key, "Cell type")

    # Warning for resource matching
    if params.species == "mouse" and params.liana_resource == "consensus":
        await ctx.warning(
            "Using 'consensus' for mouse data. Consider liana_resource='mouseconsensus'."
        )


async def analyze_cell_communication(
    data_id: str,
    ctx: "ToolContext",
    params: CellCommunicationParameters,  # No default - must be provided by caller (LLM)
) -> CellCommunicationResult:
    """Analyze cell-cell communication in spatial transcriptomics data

    Args:
        data_id: Dataset ID
        ctx: ToolContext for data access and logging
        params: Cell communication analysis parameters

    Returns:
        Cell communication analysis result
    """
    # Get data via ToolContext
    adata = await ctx.get_adata(data_id)

    try:
        # Apply method-specific validation
        if params.method == "liana":
            # LIANA-based methods need spatial connectivity validation
            await _validate_liana_requirements(adata, params, ctx)
        elif params.method == "cellphonedb":
            # Check if cell type column exists
            validate_obs_column(adata, params.cell_type_key, "Cell type")

        # Check for low counts
        n_genes = adata.raw.n_vars if adata.raw is not None else adata.n_vars
        if n_genes < 5000:
            await ctx.warning(
                f"Gene count ({n_genes}) is relatively low. "
                f"This may limit the number of interactions found."
            )

        if adata.n_obs < 100:
            await ctx.warning(
                f"Cell count ({adata.n_obs}) is relatively low. "
                f"This may affect statistical power."
            )

        # Note: LIANA internally handles use_raw parameter automatically
        # No need for manual data_source switching - consistent with other tools

        # Analyze cell communication using selected method
        if params.method == "liana":
            require("liana", ctx, feature="LIANA+ cell communication analysis")
            result_data = await _analyze_communication_liana(adata, params, ctx)

        elif params.method == "cellphonedb":
            require(
                "cellphonedb", ctx, feature="CellPhoneDB cell communication analysis"
            )
            result_data = await _analyze_communication_cellphonedb(adata, params, ctx)

        elif params.method == "cellchat_r":
            validate_r_package(
                "CellChat",
                ctx,
                install_cmd="devtools::install_github('jinworks/CellChat')",
            )
            result_data = _analyze_communication_cellchat_r(adata, params, ctx)

        elif params.method == "fastccc":
            require("fastccc", ctx, feature="FastCCC cell communication analysis")
            result_data = await _analyze_communication_fastccc(adata, params, ctx)

        else:
            raise ParameterError(
                f"Unsupported method: {params.method}. "
                f"Supported methods: 'liana', 'cellphonedb', 'cellchat_r', 'fastccc'"
            )

        # Note: Results are already stored in adata.uns by the analysis methods
        # Since ctx.get_adata() returns a reference to the stored object,
        # modifications to adata.uns are automatically persisted

        # Store scientific metadata for reproducibility
        from ..utils.adata_utils import store_analysis_metadata

        # Determine database used
        if params.method == "liana":
            database = params.liana_resource
        elif params.method == "cellphonedb":
            database = "cellphonedb"
        elif params.method == "cellchat_liana":
            database = (
                "cellchatdb"  # Match actual LIANA resource name used in implementation
            )
        elif params.method == "cellchat_r":
            database = f"CellChatDB.{params.species}"  # Native R CellChat database
        elif params.method == "fastccc":
            database = "fastccc_builtin"  # FastCCC built-in LR database
        else:
            database = "unknown"

        # Extract results keys
        results_keys_dict = {"obs": [], "obsm": [], "uns": []}

        if result_data.get("liana_results_key"):
            results_keys_dict["uns"].append(result_data["liana_results_key"])
        if result_data.get("liana_spatial_results_key"):
            results_keys_dict["uns"].append(result_data["liana_spatial_results_key"])
        if result_data.get("liana_spatial_scores_key"):
            results_keys_dict["obsm"].append(result_data["liana_spatial_scores_key"])
        if result_data.get("cellphonedb_results_key"):
            results_keys_dict["uns"].append(result_data["cellphonedb_results_key"])
        if result_data.get("cellchat_r_results_key"):
            results_keys_dict["uns"].append(result_data["cellchat_r_results_key"])
        if result_data.get("fastccc_results_key"):
            results_keys_dict["uns"].append(result_data["fastccc_results_key"])

        # Store metadata
        store_analysis_metadata(
            adata,
            analysis_name=f"cell_communication_{params.method}",
            method=params.method,
            parameters={
                "cell_type_key": params.cell_type_key,
                "n_perms": (params.liana_n_perms if params.method == "liana" else None),
                "nz_prop": (params.liana_nz_prop if params.method == "liana" else None),
                "min_cells": params.min_cells,
                "iterations": (
                    params.cellphonedb_iterations
                    if params.method == "cellphonedb"
                    else None
                ),
                "threshold": (
                    params.cellphonedb_threshold
                    if params.method == "cellphonedb"
                    else None
                ),
            },
            results_keys=results_keys_dict,
            statistics={
                "n_lr_pairs": result_data["n_lr_pairs"],
                "n_significant_pairs": result_data["n_significant_pairs"],
                "analysis_type": result_data.get("analysis_type"),
            },
            species=params.species,
            database=database,
        )

        # Create result
        result = CellCommunicationResult(
            data_id=data_id,
            method=params.method,
            species=params.species,
            database=database,  # Use actual database/resource determined above
            n_lr_pairs=result_data["n_lr_pairs"],
            n_significant_pairs=result_data["n_significant_pairs"],
            global_results_key=result_data.get("global_results_key"),
            top_lr_pairs=result_data.get("top_lr_pairs", []),
            local_analysis_performed=result_data.get("local_analysis_performed", False),
            local_results_key=result_data.get("local_results_key"),
            communication_matrices_key=result_data.get("communication_matrices_key"),
            liana_results_key=result_data.get("liana_results_key"),
            liana_spatial_results_key=result_data.get("liana_spatial_results_key"),
            liana_spatial_scores_key=result_data.get("liana_spatial_scores_key"),
            analysis_type=result_data.get("analysis_type"),
            patterns_identified=result_data.get("patterns_identified", False),
            n_patterns=result_data.get("n_patterns"),
            patterns_key=result_data.get("patterns_key"),
            visualization=None,  # Use visualize_data tool instead
            network_visualization=None,  # Use visualize_data tool instead
            statistics=result_data.get("statistics", {}),
        )

        return result

    except Exception as e:
        raise ProcessingError(f"Error in cell communication analysis: {e}") from e

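The entry point above takes a dataset id, a ToolContext, and an explicit CellCommunicationParameters object. An illustrative invocation sketch follows (editorial addition, not part of the packaged file); the field names are taken from the code above, but the concrete values and the surrounding helper are assumptions.

    # Illustrative sketch only - assumes an existing ToolContext `ctx` and a registered dataset id.
    from chatspatial.models.data import CellCommunicationParameters
    from chatspatial.tools.cell_communication import analyze_cell_communication

    async def run_example(ctx, data_id: str):
        params = CellCommunicationParameters(
            method="liana",                   # or "cellphonedb", "cellchat_r", "fastccc"
            species="mouse",                  # must be given explicitly for LIANA
            cell_type_key="cell_type",        # column in adata.obs
            liana_resource="mouseconsensus",  # matched to the species
        )
        result = await analyze_cell_communication(data_id, ctx, params)
        return result.n_lr_pairs, result.n_significant_pairs, result.top_lr_pairs
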
async def _analyze_communication_liana(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using LIANA+"""
    # Use centralized dependency manager for consistent error handling
    require("liana")  # Raises ImportError with install instructions if missing
    import liana as li  # noqa: F401

    try:
        # Ensure spatial connectivity is computed
        if "spatial_connectivities" not in adata.obsp:
            # Use parameters from user or determine optimal bandwidth based on data size
            if params.liana_bandwidth is not None:
                bandwidth = params.liana_bandwidth
            elif adata.n_obs > 3000:
                bandwidth = 300  # Larger bandwidth for large datasets
            else:
                bandwidth = 200  # Standard bandwidth

            # Use Squidpy for spatial neighbor computation
            # Note: Spatial analysis requires spatial neighbors (physical coordinates), not expression neighbors
            # Use centralized dependency manager for consistent error handling
            require(
                "squidpy"
            )  # Raises ImportError with install instructions if missing
            import squidpy as sq

            # Squidpy's spatial_neighbors uses PHYSICAL coordinates
            sq.gr.spatial_neighbors(
                adata,
                coord_type="generic",
                n_neighs=min(30, max(6, adata.n_obs // 100)),  # Adaptive neighbor count
                radius=bandwidth if bandwidth else None,
                delaunay=True,  # Use Delaunay triangulation for spatial data
                set_diag=False,  # Standard practice for spatial graphs
            )

        # Validate species parameter is specified
        if not params.species:
            raise ParameterError(
                "Species parameter is required!\n\n"
                "You must explicitly specify the species of your data:\n"
                "  - species='human': For human data (genes like ACTB, GAPDH)\n"
                "  - species='mouse': For mouse data (genes like Actb, Gapdh)\n"
                "  - species='zebrafish': For zebrafish data\n\n"
                "Example usage:\n"
                "  params = {\n"
                "    'species': 'mouse',\n"
                "    'cell_type_key': 'cell_type',\n"
                "    'liana_resource': 'mouseconsensus'\n"
                "  }"
            )

        # Determine analysis type based on data characteristics
        has_clusters = params.cell_type_key in adata.obs.columns

        if has_clusters and not params.perform_spatial_analysis:
            # Single-cell style analysis with clusters
            return _run_liana_cluster_analysis(adata, params, ctx)
        else:
            # Spatial bivariate analysis
            return _run_liana_spatial_analysis(adata, params, ctx)

    except Exception as e:
        raise ProcessingError(f"LIANA+ analysis failed: {e}") from e

def _get_liana_resource_name(species: str, resource_preference: str) -> str:
    """Get appropriate LIANA+ resource name based on species with enhanced resource support"""
    if species == "mouse":
        # Mouse-specific resources
        mouse_resources = ["mouseconsensus", "cellphonedb", "celltalkdb", "icellnet"]

        if resource_preference == "consensus":
            return "mouseconsensus"  # Auto-map consensus to mouseconsensus for mouse
        elif resource_preference in mouse_resources:
            return (
                resource_preference  # Use as specified if it's a valid mouse resource
            )
        else:
            # For non-mouse-specific resources, still use them but could warn
            return resource_preference
    else:
        # For human or other species, use as specified
        return resource_preference

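A minimal illustration of the mapping performed by the helper above (editorial sketch, derived directly from its branches):

    # A 'consensus' preference resolves to the mouse-specific resource for mouse data.
    assert _get_liana_resource_name("mouse", "consensus") == "mouseconsensus"
    assert _get_liana_resource_name("mouse", "cellphonedb") == "cellphonedb"
    assert _get_liana_resource_name("human", "consensus") == "consensus"
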
def _run_liana_cluster_analysis(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Run LIANA+ cluster-based analysis"""
    import liana as li

    # Use cell_type_key from params (required field, no auto-detect)
    groupby_col = params.cell_type_key

    validate_obs_column(adata, groupby_col, "Cell type")

    # Get appropriate resource name based on species
    resource_name = _get_liana_resource_name(params.species, params.liana_resource)

    # Use parameters from user (respect user choice)
    n_perms = params.liana_n_perms

    # Run LIANA+ rank aggregate
    li.mt.rank_aggregate(
        adata,
        groupby=groupby_col,
        resource_name=resource_name,
        expr_prop=params.liana_nz_prop,
        min_cells=params.min_cells,
        n_perms=n_perms,
        verbose=False,
        use_raw=adata.raw is not None,
    )

    # Get results
    liana_res = adata.uns["liana_res"]

    # Calculate statistics using magnitude_rank (signal strength)
    # NOT specificity_rank (which has non-uniform distribution)
    n_lr_pairs = len(liana_res)
    # Use configurable significance threshold (default: 0.05)
    significance_alpha = params.liana_significance_alpha
    n_significant_pairs = len(
        liana_res[liana_res["magnitude_rank"] <= significance_alpha]
    )

    # Get top pairs using vectorized operations (faster than iterrows)
    top_lr_pairs = []
    detected_lr_pairs = []
    if "magnitude_rank" in liana_res.columns:
        top_pairs_df = liana_res.nsmallest(params.plot_top_pairs, "magnitude_rank")
        # Vectorized string concatenation
        ligands = top_pairs_df["ligand_complex"].values
        receptors = top_pairs_df["receptor_complex"].values
        top_lr_pairs = [f"{lig}_{rec}" for lig, rec in zip(ligands, receptors)]
        detected_lr_pairs = list(zip(ligands, receptors))

    # Store in standardized format for visualization
    adata.uns["detected_lr_pairs"] = detected_lr_pairs
    adata.uns["cell_communication_results"] = {
        "top_lr_pairs": top_lr_pairs,
        "method": "liana_cluster",
        "n_pairs": len(top_lr_pairs),
        "species": params.species,
    }

    statistics = {
        "method": "liana_cluster",
        "groupby": groupby_col,
        "n_lr_pairs_tested": n_lr_pairs,
        "n_permutations": n_perms,
        "significance_threshold": significance_alpha,
        "resource": params.liana_resource,
    }

    return {
        "n_lr_pairs": n_lr_pairs,
        "n_significant_pairs": n_significant_pairs,
        "top_lr_pairs": top_lr_pairs,
        # "liana_results_key": "liana_res",  # Removed to prevent potential DataFrame serialization overflow
        "analysis_type": "cluster",
        "statistics": statistics,
    }

def _run_liana_spatial_analysis(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Run LIANA+ spatial bivariate analysis"""
    import liana as li

    # Get appropriate resource name based on species
    resource_name = _get_liana_resource_name(params.species, params.liana_resource)

    # Use parameters from user (respect user choice)
    n_perms = params.liana_n_perms
    nz_prop = params.liana_nz_prop

    # Run LIANA+ bivariate analysis
    lrdata = li.mt.bivariate(
        adata,
        resource_name=resource_name,
        local_name=params.liana_local_metric,
        global_name=params.liana_global_metric,
        n_perms=n_perms,
        mask_negatives=False,
        add_categories=True,
        nz_prop=nz_prop,
        use_raw=False,
        verbose=False,
    )

    # Get results summary
    n_lr_pairs = lrdata.n_vars

    # Get top pairs based on global metric
    global_metric = params.liana_global_metric
    top_pairs_df = lrdata.var.nlargest(params.plot_top_pairs, global_metric)
    top_lr_pairs = top_pairs_df.index.tolist()

    # Count significant pairs using statistical significance (p-values with FDR correction)
    #
    # P-values are ALWAYS available because:
    # 1. We always pass global_name (params.liana_global_metric, default: "morans")
    # 2. We always pass n_perms > 0 (params.liana_n_perms, default: 1000, Field(gt=0))
    # 3. LIANA computes p-values via permutation test when n_perms > 0
    #    (see liana/method/sp/_bivariate/_global_functions.py lines 104-128)
    from statsmodels.stats.multitest import multipletests

    pvals_col = f"{global_metric}_pvals"
    alpha = params.liana_significance_alpha
    pvals = lrdata.var[pvals_col]

    reject, pvals_corrected, _, _ = multipletests(
        pvals, alpha=alpha, method="fdr_bh"  # Benjamini-Hochberg FDR correction
    )

    n_significant_pairs = reject.sum()

    # Store corrected p-values and significance flags for downstream use
    lrdata.var[f"{pvals_col}_corrected"] = pvals_corrected
    lrdata.var[f"{global_metric}_significant"] = reject

    # Store results in adata
    adata.uns["liana_spatial_res"] = lrdata.var
    adata.obsm["liana_spatial_scores"] = to_dense(lrdata.X)
    adata.uns["liana_spatial_interactions"] = lrdata.var.index.tolist()

    if "pvals" in lrdata.layers:
        adata.obsm["liana_spatial_pvals"] = to_dense(lrdata.layers["pvals"])

    if "cats" in lrdata.layers:
        adata.obsm["liana_spatial_cats"] = to_dense(lrdata.layers["cats"])

    # Store standardized L-R pairs for visualization
    detected_lr_pairs = []
    for pair_str in top_lr_pairs:
        if "^" in pair_str:
            ligand, receptor = pair_str.split("^", 1)
            detected_lr_pairs.append((ligand, receptor))
        elif "_" in pair_str:
            parts = pair_str.split("_")
            if len(parts) == 2:
                detected_lr_pairs.append((parts[0], parts[1]))

    # Store in standardized format for visualization
    adata.uns["detected_lr_pairs"] = detected_lr_pairs
    adata.uns["cell_communication_results"] = {
        "top_lr_pairs": top_lr_pairs,
        "method": "liana_spatial",
        "n_pairs": len(top_lr_pairs),
        "species": params.species,
    }

    statistics = {
        "method": "liana_spatial",
        "local_metric": params.liana_local_metric,
        "global_metric": params.liana_global_metric,
        "n_lr_pairs_tested": n_lr_pairs,
        "n_permutations": n_perms,
        "nz_proportion": nz_prop,
        "resource": params.liana_resource,
        "significance_method": (
            "FDR-corrected p-values"
            if pvals_col in lrdata.var.columns
            else "threshold-based (deprecated)"
        ),
        "fdr_method": "Benjamini-Hochberg" if pvals_col in lrdata.var.columns else None,
        "alpha": alpha if pvals_col in lrdata.var.columns else None,
    }

    return {
        "n_lr_pairs": n_lr_pairs,
        "n_significant_pairs": n_significant_pairs,
        "top_lr_pairs": top_lr_pairs,
        "liana_spatial_results_key": "liana_spatial_res",
        "liana_spatial_scores_key": "liana_spatial_scores",
        "analysis_type": "spatial",
        "statistics": statistics,
    }

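The significance call in the function above applies Benjamini-Hochberg FDR correction to the permutation p-values of the chosen global metric. A small self-contained sketch of that step (editorial illustration with made-up p-values, not package code):

    import numpy as np
    from statsmodels.stats.multitest import multipletests

    # Toy permutation p-values for five L-R pairs
    pvals = np.array([0.001, 0.02, 0.04, 0.30, 0.90])
    reject, pvals_corrected, _, _ = multipletests(pvals, alpha=0.05, method="fdr_bh")
    print(int(reject.sum()), "of", len(pvals), "pairs significant after BH correction")
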
def _ensure_cellphonedb_database(output_dir: str, ctx: "ToolContext") -> str:
    """Ensure CellPhoneDB database is available, download if not exists"""
    # Use centralized dependency manager for consistent error handling
    require("cellphonedb")  # Raises ImportError with install instructions if missing
    import os

    from cellphonedb.utils import db_utils

    # Check if database file already exists
    db_path = os.path.join(output_dir, "cellphonedb.zip")

    if os.path.exists(db_path):
        return db_path

    try:
        # Download latest database
        db_utils.download_database(output_dir, "v5.0.0")

        return db_path

    except Exception as e:
        error_msg = (
            f"Failed to download CellPhoneDB database: {e}\n\n"
            "Troubleshooting:\n"
            "1. Check internet connection\n"
            "2. Verify CellPhoneDB version compatibility\n"
            "3. Try manually downloading database:\n"
            "   from cellphonedb.utils import db_utils\n"
            "   db_utils.download_database('/path/to/dir', 'v5.0.0')"
        )
        raise DependencyError(error_msg) from e

async def _analyze_communication_cellphonedb(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using CellPhoneDB"""
    # Use centralized dependency manager for consistent error handling
    require("cellphonedb")  # Raises ImportError with install instructions if missing
    import os
    import tempfile

    from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

    # Initialized before the try block so the cleanup in `finally` cannot hit an
    # unbound name if an error occurs during setup
    microenvs_file = None

    try:
        import time

        start_time = time.time()

        # Use cell_type_key from params (required field, no auto-detect)
        cell_type_col = params.cell_type_key

        validate_obs_column(adata, cell_type_col, "Cell type")

        # Use original adata directly (no gene filtering needed)
        adata_for_analysis = adata

        # Import pandas for DataFrame operations
        import csv

        import pandas as pd
        import scipy.sparse as sp

        # Check if data is sparse (used for efficient matrix access)
        is_sparse = sp.issparse(adata_for_analysis.X)

        # Prepare meta data
        meta_df = pd.DataFrame(
            {
                "Cell": adata_for_analysis.obs.index,
                "cell_type": adata_for_analysis.obs[cell_type_col].astype(str),
            }
        )

        # Create microenvironments file if spatial data is available and requested
        if (
            params.cellphonedb_use_microenvironments
            and "spatial" in adata_for_analysis.obsm
        ):
            microenvs_file = await _create_microenvironments_file(
                adata_for_analysis, params, ctx
            )

        # Set random seed for reproducibility
        debug_seed = (
            params.cellphonedb_debug_seed
            if params.cellphonedb_debug_seed is not None
            else 42
        )
        np.random.seed(debug_seed)

        # Run CellPhoneDB statistical analysis
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save data to temporary files
            counts_file = os.path.join(temp_dir, "counts.txt")
            meta_file = os.path.join(temp_dir, "meta.txt")

            # Direct file writing: Stream sparse matrix to CSV without creating DataFrame
            # Memory-efficient approach: write gene-by-gene instead of toarray()
            with open(counts_file, "w", newline="") as f:
                writer = csv.writer(f, delimiter="\t")

                # Write header: empty first column + cell names
                header = [""] + list(adata_for_analysis.obs_names)
                writer.writerow(header)

                # Convert to CSC for efficient column access (genes)
                if is_sparse:
                    X_csc = adata_for_analysis.X.tocsc()
                else:
                    X_csc = adata_for_analysis.X

                # Write gene-by-gene (memory constant)
                for i, gene_name in enumerate(adata_for_analysis.var_names):
                    gene_expression = to_dense(X_csc[:, i]).flatten()
                    writer.writerow([gene_name] + list(gene_expression))

            meta_df.to_csv(meta_file, sep="\t", index=False)

            try:
                db_path = _ensure_cellphonedb_database(temp_dir, ctx)
            except Exception as db_error:
                raise DependencyError(
                    f"CellPhoneDB database setup failed: {db_error}"
                ) from db_error

            # Run the analysis using CellPhoneDB v5 API with correct parameters
            try:
                # STRICT: CellPhoneDB v5 ONLY - no backward compatibility for older versions
                result = cpdb_statistical_analysis_method.call(
                    cpdb_file_path=db_path,  # Fixed: Use actual database path
                    meta_file_path=meta_file,
                    counts_file_path=counts_file,
                    counts_data="hgnc_symbol",  # Improved: Use recommended gene identifier
                    threshold=params.cellphonedb_threshold,
                    result_precision=params.cellphonedb_result_precision,
                    pvalue=params.cellphonedb_pvalue,
                    iterations=params.cellphonedb_iterations,
                    debug_seed=debug_seed,
                    output_path=temp_dir,
                    microenvs_file_path=microenvs_file,
                    score_interactions=False,  # Disabled: CellPhoneDB v5 scoring has bugs
                )
            except KeyError as key_error:
                raise ProcessingError(
                    f"CellPhoneDB found no L-R interactions. "
                    f"CellPhoneDB is human-only; use method='liana' for mouse data. "
                    f"Error: {key_error}"
                ) from key_error
            except Exception as api_error:
                raise ProcessingError(
                    f"CellPhoneDB analysis failed: {str(api_error)}. "
                    f"Consider using method='liana' as alternative."
                ) from api_error

            # Validate CellPhoneDB v5 format
            if not isinstance(result, dict):
                raise ProcessingError(
                    f"CellPhoneDB returned unexpected format: {type(result).__name__}. "
                    f"Expected dict from CellPhoneDB v5. Check installation: pip install 'cellphonedb>=5.0.0'"
                )

            # Check for empty results (no interactions found)
            if not result or "significant_means" not in result:
                raise DataNotFoundError(
                    "CellPhoneDB found no L-R interactions. "
                    "CellPhoneDB is human-only; use method='liana' for mouse data."
                )

            # Extract results from CellPhoneDB v5 dictionary format
            deconvoluted = result.get("deconvoluted")
            means = result.get("means")
            pvalues = result.get("pvalues")
            significant_means = result.get("significant_means")

            # Store results in AnnData object
            adata.uns["cellphonedb_deconvoluted"] = deconvoluted
            adata.uns["cellphonedb_means"] = means
            adata.uns["cellphonedb_pvalues"] = pvalues
            adata.uns["cellphonedb_significant_means"] = significant_means

            # Calculate statistics
            n_lr_pairs = (
                len(means) if means is not None and hasattr(means, "__len__") else 0
            )

            # Filter significant pairs based on p-values
            # CellPhoneDB v5 returns all pairs in 'significant_means', so manual filtering is needed
            if (
                pvalues is None
                or not hasattr(pvalues, "values")
                or means is None
                or not hasattr(means, "index")
            ):
                raise DataNotFoundError(
                    "CellPhoneDB p-values unavailable - cannot identify significant interactions. "
                    "Try method='liana' as alternative."
                )

            # Filter pairs where ANY cell-cell interaction has p < threshold
            # WITH multiple testing correction for cell type pairs
            threshold = params.cellphonedb_pvalue
            correction_method = params.cellphonedb_correction_method

            # Use nanmin to find minimum p-value across all cell type pairs
            # A pair is significant if its minimum p-value < threshold (after correction)
            # Convert to numeric to handle any non-numeric values
            pval_array = pvalues.select_dtypes(include=[np.number]).values
            if pval_array.shape[0] == 0:
                raise ProcessingError("CellPhoneDB p-values are not numeric.")

            # Apply multiple testing correction if requested
            # Correct p-values for each L-R pair across its cell type pairs to control FPR
            n_cell_type_pairs = pval_array.shape[1]
            n_lr_pairs_total = pval_array.shape[0]

            if correction_method == "none":
                # No correction: use minimum p-value (not recommended)
                min_pvals = np.nanmin(pval_array, axis=1)
                mask = min_pvals < threshold

                await ctx.warning(
                    f"Multiple testing correction disabled. With {n_cell_type_pairs} cell type pairs, consider using 'fdr_bh' or 'bonferroni'."
                )

                # For 'none', we don't have corrected p-values per se, just use min
                min_pvals_corrected = min_pvals.copy()

            else:
                # CORRECT APPROACH: For each L-R pair, correct its cell type pair p-values
                # Then check if ANY cell type pair remains significant after correction
                from statsmodels.stats.multitest import multipletests

                mask = np.zeros(n_lr_pairs_total, dtype=bool)
                min_pvals_corrected = np.ones(
                    n_lr_pairs_total
                )  # Store minimum corrected p-value

                n_uncorrected_sig = 0
                n_corrected_sig = 0

                for i in range(n_lr_pairs_total):
                    # Get p-values for this L-R pair across all cell type pairs
                    pvals_this_lr = pval_array[i, :]

                    # Count uncorrected significance
                    n_uncorrected_sig += (pvals_this_lr < threshold).any()

                    # Apply correction across cell type pairs for this L-R pair
                    reject_this_lr, pvals_corrected_this_lr, _, _ = multipletests(
                        pvals_this_lr,
                        alpha=threshold,
                        method=correction_method,
                        is_sorted=False,
                        returnsorted=False,
                    )

                    # This L-R pair is significant if ANY cell type pair is significant after correction
                    if reject_this_lr.any():
                        mask[i] = True
                        n_corrected_sig += 1

                    # Store minimum corrected p-value for this L-R pair
                    min_pvals_corrected[i] = pvals_corrected_this_lr.min()

            n_significant_pairs = int(np.sum(mask))

            # Store minimum corrected p-values for transparency
            # Convert Series to DataFrame for H5AD compatibility (H5AD cannot store pd.Series)
            adata.uns["cellphonedb_pvalues_min_corrected"] = pd.DataFrame(
                {f"min_corrected_pvalue_{correction_method}": min_pvals_corrected},
                index=pvalues.index.astype(str),
            )

            # Update stored significant_means to match filtered results
            if n_significant_pairs > 0:
                significant_indices = means.index[mask]
                significant_means_filtered = means.loc[significant_indices]

                # Update stored significant_means
                adata.uns["cellphonedb_significant_means"] = significant_means_filtered

                # Also update the variable for downstream use
                significant_means = significant_means_filtered
            else:
                # No significant interactions found
                await ctx.warning(
                    f"No significant interactions found at p < {threshold}. Consider adjusting threshold or using method='liana'."
                )

            # Get top LR pairs
            # CellPhoneDB returns interactions in 'interacting_pair' column
            top_lr_pairs = []
            if (
                significant_means is not None
                and hasattr(significant_means, "head")
                and hasattr(significant_means, "columns")
                and "interacting_pair" in significant_means.columns
            ):
                top_pairs_df = significant_means.head(params.plot_top_pairs)
                top_lr_pairs = top_pairs_df["interacting_pair"].tolist()

            end_time = time.time()
            analysis_time = end_time - start_time

            n_cell_types = meta_df["cell_type"].nunique()
            n_cell_type_pairs = n_cell_types**2

            # Add correction statistics (useful for understanding results)
            # When correction_method != "none", n_uncorrected_sig and n_corrected_sig
            # are always defined in the else branch above
            correction_stats = {}
            if correction_method != "none":
                correction_stats["n_uncorrected_significant"] = int(n_uncorrected_sig)
                correction_stats["n_corrected_significant"] = int(n_corrected_sig)
                if n_uncorrected_sig > 0:
                    correction_stats["reduction_percentage"] = round(
                        (1 - n_corrected_sig / n_uncorrected_sig) * 100, 2
                    )

            statistics = {
                "method": "cellphonedb",
                "iterations": params.cellphonedb_iterations,
                "threshold": params.cellphonedb_threshold,
                "pvalue_threshold": params.cellphonedb_pvalue,
                "n_cell_types": n_cell_types,
                "n_cell_type_pairs": n_cell_type_pairs,
                "multiple_testing_correction": correction_method,
                "microenvironments_used": microenvs_file is not None,
                "analysis_time_seconds": analysis_time,
            }

            # Add correction stats if available
            if correction_stats:
                statistics["correction_statistics"] = correction_stats

            return {
                "n_lr_pairs": n_lr_pairs,
                "n_significant_pairs": n_significant_pairs,
                "top_lr_pairs": top_lr_pairs,
                "cellphonedb_results_key": "cellphonedb_means",
                "cellphonedb_pvalues_key": "cellphonedb_pvalues",
                "cellphonedb_significant_key": "cellphonedb_significant_means",
                "analysis_type": "statistical",
                "statistics": statistics,
            }

    except Exception as e:
        raise ProcessingError(f"CellPhoneDB analysis failed: {e}") from e
    finally:
        # Cleanup: Remove temporary microenvironments file if created
        if microenvs_file is not None:
            try:
                os.remove(microenvs_file)
            except OSError:
                pass  # Cleanup failure is not critical

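The filtering above corrects, for each ligand-receptor pair, the p-values across its cell-type pairs and keeps the pair if any comparison survives correction. A compact toy sketch of that row-wise step (editorial illustration with made-up values, not package code):

    import numpy as np
    from statsmodels.stats.multitest import multipletests

    pval_array = np.array([
        [0.001, 0.20, 0.90],  # L-R pair 0: p-values across 3 cell-type pairs
        [0.04, 0.06, 0.30],   # L-R pair 1
    ])
    mask = np.zeros(pval_array.shape[0], dtype=bool)
    for i in range(pval_array.shape[0]):
        reject, _, _, _ = multipletests(pval_array[i, :], alpha=0.05, method="fdr_bh")
        mask[i] = reject.any()
    print(mask)  # [ True False]: pair 0 survives BH correction, pair 1 does not
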
async def _create_microenvironments_file(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> Optional[str]:
    """Create microenvironments file for CellPhoneDB spatial analysis"""
    try:
        import tempfile

        from sklearn.neighbors import NearestNeighbors

        spatial_key = get_spatial_key(adata)
        if spatial_key is None:
            return None

        spatial_coords = adata.obsm[spatial_key]

        # Determine spatial radius
        if params.cellphonedb_spatial_radius is not None:
            radius = params.cellphonedb_spatial_radius
        else:
            # Auto-determine radius based on data density
            # Use median distance to 5th nearest neighbor as a heuristic
            nn = NearestNeighbors(n_neighbors=6)
            nn.fit(spatial_coords)
            distances, _ = nn.kneighbors(spatial_coords)
            radius = np.median(distances[:, 5]) * 2  # 5th neighbor (0-indexed), doubled

        # Find spatial neighbors for each cell
        nn = NearestNeighbors(radius=radius)
        nn.fit(spatial_coords)
        neighbor_matrix = nn.radius_neighbors_graph(spatial_coords)

        # Create microenvironments using cell types
        validate_obs_column(adata, params.cell_type_key, "Cell type")

        cell_types = adata.obs[params.cell_type_key].values

        # Create microenvironments by cell type co-occurrence
        # Optimized: Single loop to build both mappings (2x faster)
        microenv_assignments = {}
        cell_type_to_microenv = {}
        microenv_counter = 0

        for i in range(adata.n_obs):
            neighbors = neighbor_matrix[i].indices
            if len(neighbors) > 1:  # At least one neighbor besides itself
                # Get unique cell types in this spatial neighborhood (computed once)
                neighbor_cell_types = set(cell_types[j] for j in neighbors)

                # Create microenvironment signature based on co-occurring cell types
                microenv_signature = tuple(sorted(neighbor_cell_types))

                # First use: create assignment if new signature
                if microenv_signature not in microenv_assignments:
                    microenv_assignments[microenv_signature] = (
                        f"microenv_{microenv_counter}"
                    )
                    microenv_counter += 1

                # Second use: update cell_type_to_microenv mappings
                microenv_name = microenv_assignments[microenv_signature]
                for ct in neighbor_cell_types:
                    if ct not in cell_type_to_microenv:
                        cell_type_to_microenv[ct] = set()
                    cell_type_to_microenv[ct].add(microenv_name)

        # Create final microenvironments list (cell_type, microenvironment)
        microenvs = []
        for cell_type, microenv_set in cell_type_to_microenv.items():
            for microenv in microenv_set:
                microenvs.append([cell_type, microenv])

        # Save to temporary file with CORRECT format for CellPhoneDB
        temp_file = tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix="_microenvironments.txt"
        )
        temp_file.write("cell_type\tmicroenvironment\n")  # FIXED: Correct header
        for cell_type, microenv in microenvs:
            temp_file.write(
                f"{cell_type}\t{microenv}\n"
            )  # FIXED: cell_type not cell barcode
        temp_file.close()

        return temp_file.name

    except Exception as e:
        await ctx.warning(f"Failed to create microenvironments file: {e}")
        return None

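The helper above emits a two-column, tab-separated microenvironments file in the layout written by its header line. An illustrative rendering of that layout (editorial sketch; cell-type and microenvironment names are placeholders):

    # Prints an example of the file layout produced above (placeholder names).
    rows = [("T_cell", "microenv_0"), ("B_cell", "microenv_0"), ("Tumor", "microenv_1")]
    print("cell_type\tmicroenvironment")
    for cell_type, microenv in rows:
        print(f"{cell_type}\t{microenv}")
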
958
|
+
def _analyze_communication_cellchat_r(
|
|
959
|
+
adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
|
|
960
|
+
) -> dict[str, Any]:
|
|
961
|
+
"""Analyze cell communication using native R CellChat package
|
|
962
|
+
|
|
963
|
+
This implementation uses rpy2 to call the original R CellChat package,
|
|
964
|
+
which includes full features like mediator proteins and signaling pathways
|
|
965
|
+
that are not available in the LIANA simplified implementation.
|
|
966
|
+
|
|
967
|
+
Args:
|
|
968
|
+
adata: AnnData object with expression data
|
|
969
|
+
params: Cell communication analysis parameters
|
|
970
|
+
ctx: ToolContext for logging and data access
|
|
971
|
+
|
|
972
|
+
Returns:
|
|
973
|
+
Dictionary with analysis results
|
|
974
|
+
"""
|
|
975
|
+
import pandas as pd
|
|
976
|
+
import rpy2.robjects as ro
|
|
977
|
+
from rpy2.robjects import numpy2ri, pandas2ri
|
|
978
|
+
from rpy2.robjects.conversion import localconverter
|
|
979
|
+
|
|
980
|
+
try:
|
|
981
|
+
import time
|
|
982
|
+
|
|
983
|
+
start_time = time.time()
|
|
984
|
+
|
|
985
|
+
# Validate cell type column
|
|
986
|
+
validate_obs_column(adata, params.cell_type_key, "Cell type")
|
|
987
|
+
|
|
988
|
+
# Check for spatial data
|
|
989
|
+
spatial_key = get_spatial_key(adata)
|
|
990
|
+
has_spatial = spatial_key is not None
|
|
991
|
+
|
|
992
|
+
# Prepare expression matrix (genes x cells, normalized)
|
|
993
|
+
# CellChat requires normalized data with comprehensive gene coverage
|
|
994
|
+
# Use adata.raw if available (contains all genes before HVG filtering)
|
|
995
|
+
if adata.raw is not None:
|
|
996
|
+
data_source = adata.raw
|
|
997
|
+
else:
|
|
998
|
+
data_source = adata
|
|
999
|
+
|
|
1000
|
+
# Run CellChat in R - start early to get gene list for pre-filtering
|
|
1001
|
+
with localconverter(
|
|
1002
|
+
ro.default_converter + pandas2ri.converter + numpy2ri.converter
|
|
1003
|
+
):
|
|
1004
|
+
# Load CellChat
|
|
1005
|
+
ro.r("library(CellChat)")
|
|
1006
|
+
|
|
1007
|
+
# Set species-specific database
|
|
1008
|
+
species_db_map = {
|
|
1009
|
+
"human": "CellChatDB.human",
|
|
1010
|
+
"mouse": "CellChatDB.mouse",
|
|
1011
|
+
"zebrafish": "CellChatDB.zebrafish",
|
|
1012
|
+
}
|
|
1013
|
+
db_name = species_db_map.get(params.species, "CellChatDB.human")
|
|
1014
|
+
|
|
1015
|
+
# Memory optimization: Get CellChatDB gene list and pre-filter
|
|
1016
|
+
# This reduces memory from O(n_cells × n_all_genes) to O(n_cells × n_db_genes)
|
|
1017
|
+
# Typical savings: 20000 genes → 1500 genes = 13x memory reduction
|
|
1018
|
+
ro.r(
|
|
1019
|
+
f"""
|
|
1020
|
+
CellChatDB <- {db_name}
|
|
1021
|
+
# Get all genes used in CellChatDB (ligands, receptors, cofactors)
|
|
1022
|
+
cellchat_genes <- unique(c(
|
|
1023
|
+
CellChatDB$geneInfo$Symbol,
|
|
1024
|
+
unlist(strsplit(CellChatDB$interaction$ligand, "_")),
|
|
1025
|
+
unlist(strsplit(CellChatDB$interaction$receptor, "_"))
|
|
1026
|
+
))
|
|
1027
|
+
cellchat_genes <- cellchat_genes[!is.na(cellchat_genes)]
|
|
1028
|
+
"""
|
|
1029
|
+
)
|
|
1030
|
+
cellchat_genes_r = ro.r("cellchat_genes")
|
|
1031
|
+
cellchat_genes = set(cellchat_genes_r)
|
|
1032
|
+
|
|
1033
|
+
# Filter to genes present in both data and CellChatDB
|
|
1034
|
+
common_genes = data_source.var_names.intersection(cellchat_genes)
|
|
1035
|
+
|
|
1036
|
+
if len(common_genes) == 0:
|
|
1037
|
+
raise ValueError(
|
|
1038
|
+
f"No genes overlap between data and {db_name}. "
|
|
1039
|
+
f"Check if species parameter matches your data."
|
|
1040
|
+
)
|
|
1041
|
+
|
|
1042
|
+
# Create expression matrix with only CellChatDB genes (memory efficient)
|
|
1043
|
+
gene_indices = [data_source.var_names.get_loc(g) for g in common_genes]
|
|
1044
|
+
expr_matrix = pd.DataFrame(
|
|
1045
|
+
to_dense(data_source.X[:, gene_indices]).T,
|
|
1046
|
+
index=common_genes,
|
|
1047
|
+
columns=adata.obs_names,
|
|
1048
|
+
)
|
|
1049
|
+
|
|
1050
|
+
# Prepare metadata
|
|
1051
|
+
# CellChat doesn't allow labels starting with '0', so add prefix for numeric
|
|
1052
|
+
cell_labels = adata.obs[params.cell_type_key].astype(str).values
|
|
1053
|
+
# Check if any label is '0' or starts with a digit - add 'cluster_' prefix
|
|
1054
|
+
if any(
|
|
1055
|
+
label == "0" or (label and label[0].isdigit()) for label in cell_labels
|
|
1056
|
+
):
|
|
1057
|
+
cell_labels = [f"cluster_{label}" for label in cell_labels]
|
|
1058
|
+
meta_df = pd.DataFrame(
|
|
1059
|
+
{"labels": cell_labels},
|
|
1060
|
+
index=adata.obs_names,
|
|
1061
|
+
)
|
|
1062
|
+
|
|
1063
|
+
# Prepare spatial coordinates if available
|
|
1064
|
+
spatial_locs = None
|
|
1065
|
+
if has_spatial and params.cellchat_distance_use:
|
|
1066
|
+
spatial_coords = adata.obsm[spatial_key]
|
|
1067
|
+
spatial_locs = pd.DataFrame(
|
|
1068
|
+
spatial_coords[:, :2],
|
|
1069
|
+
index=adata.obs_names,
|
|
1070
|
+
columns=["x", "y"],
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
# Transfer data to R
|
|
1074
|
+
ro.globalenv["expr_matrix"] = expr_matrix
|
|
1075
|
+
ro.globalenv["meta_df"] = meta_df
|
|
1076
|
+
|
|
1077
|
+
# Create CellChat object (db_name already set during gene pre-filtering)
|
|
1078
|
+
if (
|
|
1079
|
+
has_spatial
|
|
1080
|
+
and params.cellchat_distance_use
|
|
1081
|
+
and spatial_locs is not None
|
|
1082
|
+
):
|
|
1083
|
+
# Spatial mode
|
|
1084
|
+
ro.globalenv["spatial_locs"] = spatial_locs
|
|
1085
|
+
|
|
1086
|
+
# CellChat v2 requires spatial.factors with 'ratio' and 'tol':
|
|
1087
|
+
# - ratio: conversion factor from pixels to micrometers (um)
|
|
1088
|
+
# - tol: tolerance factor (half of spot/cell size in um)
|
|
1089
|
+
# Use user-configurable parameters for platform flexibility
|
|
1090
|
+
pixel_ratio = params.cellchat_pixel_ratio
|
|
1091
|
+
spatial_tol = params.cellchat_spatial_tol
|
|
1092
|
+
ro.globalenv["pixel_ratio"] = pixel_ratio
|
|
1093
|
+
ro.globalenv["spatial_tol"] = spatial_tol
|
|
1094
|
+
ro.r(
|
|
1095
|
+
"""
|
|
1096
|
+
spatial.factors <- data.frame(
|
|
1097
|
+
ratio = pixel_ratio,
|
|
1098
|
+
tol = spatial_tol
|
|
1099
|
+
)
|
|
1100
|
+
|
|
1101
|
+
cellchat <- createCellChat(
|
|
1102
|
+
object = as.matrix(expr_matrix),
|
|
1103
|
+
meta = meta_df,
|
|
1104
|
+
group.by = "labels",
|
|
1105
|
+
datatype = "spatial",
|
|
1106
|
+
coordinates = as.matrix(spatial_locs),
|
|
1107
|
+
spatial.factors = spatial.factors
|
|
1108
|
+
)
|
|
1109
|
+
"""
|
|
1110
|
+
)
|
|
1111
|
+
else:
|
|
1112
|
+
# Non-spatial mode
|
|
1113
|
+
ro.r(
|
|
1114
|
+
"""
|
|
1115
|
+
cellchat <- createCellChat(
|
|
1116
|
+
object = as.matrix(expr_matrix),
|
|
1117
|
+
meta = meta_df,
|
|
1118
|
+
group.by = "labels"
|
|
1119
|
+
)
|
|
1120
|
+
"""
|
|
1121
|
+
)
|
|
1122
|
+
|
|
1123
|
+
# Set database
|
|
1124
|
+
ro.r(
|
|
1125
|
+
f"""
|
|
1126
|
+
CellChatDB <- {db_name}
|
|
1127
|
+
"""
|
|
1128
|
+
)
|
|
1129
|
+
|
|
1130
|
+
# Subset database by category if specified
|
|
1131
|
+
if params.cellchat_db_category != "All":
|
|
1132
|
+
ro.r(
|
|
1133
|
+
f"""
|
|
1134
|
+
CellChatDB.use <- subsetDB(
|
|
1135
|
+
CellChatDB,
|
|
1136
|
+
search = "{params.cellchat_db_category}"
|
|
1137
|
+
)
|
|
1138
|
+
cellchat@DB <- CellChatDB.use
|
|
1139
|
+
"""
|
|
1140
|
+
)
|
|
1141
|
+
else:
|
|
1142
|
+
ro.r(
|
|
1143
|
+
"""
|
|
1144
|
+
cellchat@DB <- CellChatDB
|
|
1145
|
+
"""
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
# Preprocessing
|
|
1149
|
+
ro.r(
|
|
1150
|
+
"""
|
|
1151
|
+
cellchat <- subsetData(cellchat)
|
|
1152
|
+
cellchat <- identifyOverExpressedGenes(cellchat)
|
|
1153
|
+
cellchat <- identifyOverExpressedInteractions(cellchat)
|
|
1154
|
+
"""
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
# Project data (optional but recommended)
|
|
1158
|
+
ro.r(
|
|
1159
|
+
"""
|
|
1160
|
+
# Project data onto PPI network (optional)
|
|
1161
|
+
tryCatch({
|
|
1162
|
+
cellchat <- projectData(cellchat, PPI.human)
|
|
1163
|
+
}, error = function(e) {
|
|
1164
|
+
message("Skipping data projection: ", e$message)
|
|
1165
|
+
})
|
|
1166
|
+
"""
|
|
1167
|
+
)
|
|
1168
|
+
|
|
1169
|
+
# Compute communication probability
|
|
1170
|
+
if has_spatial and params.cellchat_distance_use:
|
|
1171
|
+
# Spatial mode with distance constraints
|
|
1172
|
+
# CellChat v2 requires either contact.range or contact.knn.k
|
|
1173
|
+
if params.cellchat_contact_range is not None:
|
|
1174
|
+
contact_param = f"contact.range = {params.cellchat_contact_range}"
|
|
1175
|
+
else:
|
|
1176
|
+
contact_param = f"contact.knn.k = {params.cellchat_contact_knn_k}"
|
|
1177
|
+
|
|
1178
|
+
ro.r(
|
|
1179
|
+
f"""
|
|
1180
|
+
cellchat <- computeCommunProb(
|
|
1181
|
+
cellchat,
|
|
1182
|
+
type = "{params.cellchat_type}",
|
|
1183
|
+
trim = {params.cellchat_trim},
|
|
1184
|
+
population.size = {str(params.cellchat_population_size).upper()},
|
|
1185
|
+
distance.use = TRUE,
|
|
1186
|
+
interaction.range = {params.cellchat_interaction_range},
|
|
1187
|
+
scale.distance = {params.cellchat_scale_distance},
|
|
1188
|
+
{contact_param}
|
|
1189
|
+
)
|
|
1190
|
+
"""
|
|
1191
|
+
)
|
|
1192
|
+
else:
|
|
1193
|
+
# Non-spatial mode
|
|
1194
|
+
ro.r(
|
|
1195
|
+
f"""
|
|
1196
|
+
cellchat <- computeCommunProb(
|
|
1197
|
+
cellchat,
|
|
1198
|
+
type = "{params.cellchat_type}",
|
|
1199
|
+
trim = {params.cellchat_trim},
|
|
1200
|
+
population.size = {str(params.cellchat_population_size).upper()}
|
|
1201
|
+
)
|
|
1202
|
+
"""
|
|
1203
|
+
)

        # Filter communication
        ro.r(
            f"""
            cellchat <- filterCommunication(cellchat, min.cells = {params.cellchat_min_cells})
            """
        )

        # Compute pathway-level communication
        ro.r(
            """
            cellchat <- computeCommunProbPathway(cellchat)
            """
        )

        # Aggregate network
        ro.r(
            """
            cellchat <- aggregateNet(cellchat)
            """
        )

        # Extract results
        ro.r(
            """
            # Get LR pairs
            lr_pairs <- cellchat@LR$LRsig

            # Get communication probabilities
            net <- cellchat@net

            # Get pathway-level probabilities
            netP <- cellchat@netP

            # Count interactions
            n_lr_pairs <- length(unique(lr_pairs$interaction_name))

            # Get significant pairs (probability > 0)
            prob_matrix <- net$prob
            n_significant <- sum(prob_matrix > 0, na.rm = TRUE)

            # Get top pathways
            pathway_names <- rownames(netP$prob)
            if (length(pathway_names) > 0) {
                # Sum probabilities across cell type pairs for each pathway
                pathway_sums <- rowSums(netP$prob, na.rm = TRUE)
                top_pathway_idx <- order(pathway_sums, decreasing = TRUE)[1:min(10, length(pathway_names))]
                top_pathways <- pathway_names[top_pathway_idx]
            } else {
                top_pathways <- character(0)
            }

            # Get top LR pairs
            if (nrow(lr_pairs) > 0) {
                top_lr <- head(lr_pairs$interaction_name, 10)
            } else {
                top_lr <- character(0)
            }
            """
        )

        # Convert results back to Python
        n_lr_pairs = int(ro.r("n_lr_pairs")[0])
        n_significant_pairs = int(ro.r("n_significant")[0])
        top_pathways = list(ro.r("top_pathways"))
        top_lr_pairs = list(ro.r("top_lr"))

        # Get full results for storage
        lr_pairs_df = ro.r("as.data.frame(lr_pairs)")
        prob_matrix = ro.r("as.matrix(net$prob)")
        pval_matrix = ro.r("as.matrix(net$pval)")

        # Store in adata
        adata.uns["cellchat_r_lr_pairs"] = pd.DataFrame(lr_pairs_df)
        adata.uns["cellchat_r_prob"] = np.array(prob_matrix)
        adata.uns["cellchat_r_pval"] = np.array(pval_matrix)
        adata.uns["cellchat_r_top_pathways"] = top_pathways
        adata.uns["cellchat_r_params"] = {
            "species": params.species,
            "db_category": params.cellchat_db_category,
            "type": params.cellchat_type,
            "distance_use": params.cellchat_distance_use if has_spatial else False,
        }

        # Store detected LR pairs in standardized format for visualization
        detected_lr_pairs = []
        for pair_str in top_lr_pairs:
            if "_" in pair_str:
                parts = pair_str.split("_", 1)
                if len(parts) == 2:
                    detected_lr_pairs.append((parts[0], parts[1]))

        adata.uns["detected_lr_pairs"] = detected_lr_pairs
        adata.uns["cell_communication_results"] = {
            "top_lr_pairs": top_lr_pairs,
            "top_pathways": top_pathways,
            "method": "cellchat_r",
            "n_pairs": len(top_lr_pairs),
            "species": params.species,
        }
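        # (Editorial sketch, not part of the package.) Downstream code can read these
        # standardized keys back directly, for example:
        #
        #   results = adata.uns["cell_communication_results"]
        #   print(results["method"], results["n_pairs"])
        #   for ligand, receptor in adata.uns["detected_lr_pairs"]:
        #       print(f"{ligand} -> {receptor}")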

        end_time = time.time()
        analysis_time = end_time - start_time

        statistics = {
            "method": "cellchat_r",
            "species": params.species,
            "db_category": params.cellchat_db_category,
            "aggregation_type": params.cellchat_type,
            "trim": params.cellchat_trim,
            "population_size": params.cellchat_population_size,
            "min_cells": params.cellchat_min_cells,
            "spatial_mode": has_spatial and params.cellchat_distance_use,
            "n_lr_pairs_tested": n_lr_pairs,
            "analysis_time_seconds": analysis_time,
            "top_pathways": top_pathways[:5] if top_pathways else [],
        }

        return {
            "n_lr_pairs": n_lr_pairs,
            "n_significant_pairs": n_significant_pairs,
            "top_lr_pairs": top_lr_pairs,
            "cellchat_r_results_key": "cellchat_r_lr_pairs",
            "cellchat_r_prob_key": "cellchat_r_prob",
            "cellchat_r_pval_key": "cellchat_r_pval",
            "analysis_type": "cellchat_native",
            "statistics": statistics,
        }

    except Exception as e:
        raise ProcessingError(f"CellChat R analysis failed: {e}") from e


async def _analyze_communication_fastccc(
    adata: Any, params: CellCommunicationParameters, ctx: "ToolContext"
) -> dict[str, Any]:
    """Analyze cell communication using FastCCC permutation-free framework.

    FastCCC uses FFT-based convolution to compute p-values analytically,
    making it extremely fast for large datasets (16M cells in minutes).

    Reference: Nature Communications 2025 (https://github.com/Svvord/FastCCC)

    Args:
        adata: AnnData object with expression data
        params: Cell communication analysis parameters
        ctx: ToolContext for logging and data access

    Returns:
        Dictionary with analysis results
    """
    import glob
    import os
    import tempfile
    import time

    import pandas as pd

    from ..utils.adata_utils import to_dense

    try:
        start_time = time.time()

        # Import FastCCC
        if params.fastccc_use_cauchy:
            from fastccc import Cauchy_combination_of_statistical_analysis_methods
        else:
            from fastccc import statistical_analysis_method

        # Validate cell type column
        validate_obs_column(adata, params.cell_type_key, "Cell type")

        # Use adata.raw if available for comprehensive gene coverage
        if adata.raw is not None:
            data_source = adata.raw
            await ctx.info("Using adata.raw for comprehensive gene coverage")
        else:
            data_source = adata

        # Create temporary directory for FastCCC I/O
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save expression data as h5ad for FastCCC
            counts_file = os.path.join(temp_dir, "counts.h5ad")

            # Create a minimal AnnData for saving (FastCCC reads h5ad directly)
            # IMPORTANT: FastCCC requires normalized log1p-transformed data
            # with max value < 14 (default threshold)
            import anndata as ad
            import scanpy as sc

            # Prepare expression matrix (cells × genes)
            expr_matrix = to_dense(data_source.X)
            gene_names = list(data_source.var_names)
            cell_names = list(adata.obs_names)

            # Create temporary AnnData
            temp_adata = ad.AnnData(
                X=expr_matrix.copy(),
                obs=pd.DataFrame(index=cell_names),
                var=pd.DataFrame(index=gene_names),
            )

            # Check if data needs normalization (FastCCC max threshold is 14)
            max_val = np.max(temp_adata.X)
            if max_val > 14:
                await ctx.info(
                    f"Data max value ({max_val:.1f}) exceeds FastCCC threshold (14). "
                    f"Applying normalize_total + log1p transformation..."
                )
                # Apply standard scanpy normalization pipeline
                sc.pp.normalize_total(temp_adata, target_sum=1e4)
                sc.pp.log1p(temp_adata)
                new_max = np.max(temp_adata.X)
                await ctx.info(f"After normalization: max value = {new_max:.2f}")

            # Make var names unique (FastCCC requirement)
            temp_adata.var_names_make_unique()

            # Add cell type labels to obs
            temp_adata.obs[params.cell_type_key] = adata.obs[
                params.cell_type_key
            ].values

            # Save to h5ad
            temp_adata.write_h5ad(counts_file)

            # Get database directory path (FastCCC uses CellPhoneDB database format)
            # FastCCC expects a directory containing interaction_table.csv and other files
            # Check for bundled database in chatspatial package
            chatspatial_pkg_dir = os.path.dirname(os.path.dirname(__file__))
            database_dir = os.path.join(
                chatspatial_pkg_dir,
                "data",
                "cellphonedb_v5",
                "cellphonedb-data-5.0.0",
            )
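            # (Editorial note, not part of the package.) With __file__ located at
            # chatspatial/tools/cell_communication.py, the two dirname() calls resolve
            # to the chatspatial package root, so database_dir points to
            # <site-packages>/chatspatial/data/cellphonedb_v5/cellphonedb-data-5.0.0.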

            # Verify required files exist
            required_file = os.path.join(database_dir, "interaction_table.csv")
            if not os.path.exists(required_file):
                raise ProcessingError(
                    f"FastCCC requires CellPhoneDB database files. "
                    f"Expected directory: {database_dir} with interaction_table.csv. "
                    f"Please download from: https://github.com/ventolab/cellphonedb-data"
                )

            # Output directory for results
            output_dir = os.path.join(temp_dir, "results")
            os.makedirs(output_dir, exist_ok=True)

            # Run FastCCC analysis
            if params.fastccc_use_cauchy:
                # Cauchy combination method (more robust, multiple parameter combinations)
                # Note: This function saves results to files and returns None
                Cauchy_combination_of_statistical_analysis_methods(
                    database_file_path=database_dir,
                    celltype_file_path=None,  # Using meta_key instead
                    counts_file_path=counts_file,
                    convert_type="hgnc_symbol",
                    single_unit_summary_list=[
                        "Mean",
                        "Median",
                        "Q3",
                        "Quantile_0.9",
                    ],
                    complex_aggregation_list=["Minimum", "Average"],
                    LR_combination_list=["Arithmetic", "Geometric"],
                    min_percentile=params.fastccc_min_percentile,
                    save_path=output_dir,
                    meta_key=params.cell_type_key,
                    use_DEG=params.fastccc_use_deg,
                )

                # Read results from saved files (Cauchy method saves to files)
                # Find the task ID from output files
                pval_files = glob.glob(os.path.join(output_dir, "*_Cauchy_pvals.tsv"))
                if not pval_files:
                    raise ProcessingError(
                        "FastCCC Cauchy combination did not produce output files"
                    )

                # Extract task_id from filename
                pval_file = pval_files[0]
                task_id = os.path.basename(pval_file).replace("_Cauchy_pvals.tsv", "")

                # Read combined results
                pvalues = pd.read_csv(pval_file, index_col=0, sep="\t")
                strength_file = os.path.join(
                    output_dir, f"{task_id}_average_interactions_strength.tsv"
                )
                interactions_strength = pd.read_csv(
                    strength_file, index_col=0, sep="\t"
                )

                # Percentages are in individual method files, use first one
                pct_files = glob.glob(
                    os.path.join(output_dir, f"{task_id}*percents_analysis.tsv")
                )
                if pct_files:
                    percentages = pd.read_csv(pct_files[0], index_col=0, sep="\t")
                else:
                    percentages = None

            else:
                # Single method (faster)
                interactions_strength, pvalues, percentages = (
                    statistical_analysis_method(
                        database_file_path=database_dir,
                        celltype_file_path=None,  # Using meta_key instead
                        counts_file_path=counts_file,
                        convert_type="hgnc_symbol",
                        single_unit_summary=params.fastccc_single_unit_summary,
                        complex_aggregation=params.fastccc_complex_aggregation,
                        LR_combination=params.fastccc_lr_combination,
                        min_percentile=params.fastccc_min_percentile,
                        save_path=output_dir,
                        meta_key=params.cell_type_key,
                        use_DEG=params.fastccc_use_deg,
                    )
                )
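            # (Editorial sketch, not part of the package.) In the Cauchy branch above,
            # a hypothetical output file "results/abc123_Cauchy_pvals.tsv" yields
            # task_id "abc123", so the files read back are
            # "abc123_average_interactions_strength.tsv" and any matching
            # "abc123*percents_analysis.tsv".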

            # Process results
            n_lr_pairs = len(pvalues) if pvalues is not None else 0

            # Count significant pairs
            threshold = params.fastccc_pvalue_threshold
            if pvalues is not None and hasattr(pvalues, "values"):
                # Get minimum p-value across all cell type pairs for each LR pair
                pval_array = pvalues.select_dtypes(include=[np.number]).values
                min_pvals = np.nanmin(pval_array, axis=1)
                n_significant_pairs = int(np.sum(min_pvals < threshold))
            else:
                n_significant_pairs = 0
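            # (Editorial note, not part of the package.) Because the row-wise minimum
            # is compared to the threshold, an LR pair counts as significant if it
            # passes in at least one sender/receiver cell-type combination.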

            # Get top LR pairs based on interaction strength
            top_lr_pairs = []
            detected_lr_pairs = []
            if interactions_strength is not None and hasattr(
                interactions_strength, "index"
            ):
                # Sort by mean interaction strength across cell type pairs
                if hasattr(interactions_strength, "select_dtypes"):
                    strength_array = interactions_strength.select_dtypes(
                        include=[np.number]
                    ).values
                    mean_strength = np.nanmean(strength_array, axis=1)
                    top_indices = np.argsort(mean_strength)[::-1][: params.plot_top_pairs]
                    top_lr_pairs = [interactions_strength.index[i] for i in top_indices]

                # Parse LR pair names
                for pair_str in top_lr_pairs:
                    if "_" in pair_str:
                        parts = pair_str.split("_", 1)
                        if len(parts) == 2:
                            detected_lr_pairs.append((parts[0], parts[1]))
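            # (Editorial note, not part of the package.) split("_", 1) splits on the
            # first underscore only, so a hypothetical name "LIGANDA_RECEPTORB" becomes
            # ("LIGANDA", "RECEPTORB"), and any additional underscores (e.g. receptor
            # complexes) stay on the receptor side.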

            # Store results in adata
            adata.uns["fastccc_interactions_strength"] = interactions_strength
            adata.uns["fastccc_pvalues"] = pvalues
            adata.uns["fastccc_percentages"] = percentages

            # Store standardized format for visualization
            adata.uns["detected_lr_pairs"] = detected_lr_pairs
            adata.uns["cell_communication_results"] = {
                "top_lr_pairs": top_lr_pairs,
                "method": "fastccc",
                "n_pairs": len(top_lr_pairs),
                "species": params.species,
            }

            end_time = time.time()
            analysis_time = end_time - start_time

            statistics = {
                "method": "fastccc",
                "species": params.species,
                "use_cauchy": params.fastccc_use_cauchy,
                "single_unit_summary": params.fastccc_single_unit_summary,
                "complex_aggregation": params.fastccc_complex_aggregation,
                "lr_combination": params.fastccc_lr_combination,
                "min_percentile": params.fastccc_min_percentile,
                "pvalue_threshold": threshold,
                "use_deg": params.fastccc_use_deg,
                "n_lr_pairs_tested": n_lr_pairs,
                "analysis_time_seconds": analysis_time,
                "permutation_free": True,  # Key FastCCC feature
            }

            return {
                "n_lr_pairs": n_lr_pairs,
                "n_significant_pairs": n_significant_pairs,
                "top_lr_pairs": top_lr_pairs,
                "fastccc_results_key": "fastccc_interactions_strength",
                "fastccc_pvalues_key": "fastccc_pvalues",
                "analysis_type": "fastccc_permutation_free",
                "statistics": statistics,
            }

    except Exception as e:
        raise ProcessingError(f"FastCCC analysis failed: {e}") from e