chatspatial 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatspatial/__init__.py +11 -0
- chatspatial/__main__.py +141 -0
- chatspatial/cli/__init__.py +7 -0
- chatspatial/config.py +53 -0
- chatspatial/models/__init__.py +85 -0
- chatspatial/models/analysis.py +513 -0
- chatspatial/models/data.py +2462 -0
- chatspatial/server.py +1763 -0
- chatspatial/spatial_mcp_adapter.py +720 -0
- chatspatial/tools/__init__.py +3 -0
- chatspatial/tools/annotation.py +1903 -0
- chatspatial/tools/cell_communication.py +1603 -0
- chatspatial/tools/cnv_analysis.py +605 -0
- chatspatial/tools/condition_comparison.py +595 -0
- chatspatial/tools/deconvolution/__init__.py +402 -0
- chatspatial/tools/deconvolution/base.py +318 -0
- chatspatial/tools/deconvolution/card.py +244 -0
- chatspatial/tools/deconvolution/cell2location.py +326 -0
- chatspatial/tools/deconvolution/destvi.py +144 -0
- chatspatial/tools/deconvolution/flashdeconv.py +101 -0
- chatspatial/tools/deconvolution/rctd.py +317 -0
- chatspatial/tools/deconvolution/spotlight.py +216 -0
- chatspatial/tools/deconvolution/stereoscope.py +109 -0
- chatspatial/tools/deconvolution/tangram.py +135 -0
- chatspatial/tools/differential.py +625 -0
- chatspatial/tools/embeddings.py +298 -0
- chatspatial/tools/enrichment.py +1863 -0
- chatspatial/tools/integration.py +807 -0
- chatspatial/tools/preprocessing.py +723 -0
- chatspatial/tools/spatial_domains.py +808 -0
- chatspatial/tools/spatial_genes.py +836 -0
- chatspatial/tools/spatial_registration.py +441 -0
- chatspatial/tools/spatial_statistics.py +1476 -0
- chatspatial/tools/trajectory.py +495 -0
- chatspatial/tools/velocity.py +405 -0
- chatspatial/tools/visualization/__init__.py +155 -0
- chatspatial/tools/visualization/basic.py +393 -0
- chatspatial/tools/visualization/cell_comm.py +699 -0
- chatspatial/tools/visualization/cnv.py +320 -0
- chatspatial/tools/visualization/core.py +684 -0
- chatspatial/tools/visualization/deconvolution.py +852 -0
- chatspatial/tools/visualization/enrichment.py +660 -0
- chatspatial/tools/visualization/integration.py +205 -0
- chatspatial/tools/visualization/main.py +164 -0
- chatspatial/tools/visualization/multi_gene.py +739 -0
- chatspatial/tools/visualization/persistence.py +335 -0
- chatspatial/tools/visualization/spatial_stats.py +469 -0
- chatspatial/tools/visualization/trajectory.py +639 -0
- chatspatial/tools/visualization/velocity.py +411 -0
- chatspatial/utils/__init__.py +115 -0
- chatspatial/utils/adata_utils.py +1372 -0
- chatspatial/utils/compute.py +327 -0
- chatspatial/utils/data_loader.py +499 -0
- chatspatial/utils/dependency_manager.py +462 -0
- chatspatial/utils/device_utils.py +165 -0
- chatspatial/utils/exceptions.py +185 -0
- chatspatial/utils/image_utils.py +267 -0
- chatspatial/utils/mcp_utils.py +137 -0
- chatspatial/utils/path_utils.py +243 -0
- chatspatial/utils/persistence.py +78 -0
- chatspatial/utils/scipy_compat.py +143 -0
- chatspatial-1.1.0.dist-info/METADATA +242 -0
- chatspatial-1.1.0.dist-info/RECORD +67 -0
- chatspatial-1.1.0.dist-info/WHEEL +5 -0
- chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
- chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
- chatspatial-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copy Number Variation (CNV) analysis tools for spatial transcriptomics data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import scanpy as sc
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ..spatial_mcp_adapter import ToolContext
|
|
12
|
+
|
|
13
|
+
from ..models.analysis import CNVResult
|
|
14
|
+
from ..models.data import CNVParameters
|
|
15
|
+
from ..utils import validate_obs_column
|
|
16
|
+
from ..utils.dependency_manager import require
|
|
17
|
+
from ..utils.exceptions import (
|
|
18
|
+
DataCompatibilityError,
|
|
19
|
+
DataNotFoundError,
|
|
20
|
+
DependencyError,
|
|
21
|
+
ParameterError,
|
|
22
|
+
ProcessingError,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Numbat availability is checked lazily in _infer_cnv_numbat to avoid
|
|
26
|
+
# import-time failures when rpy2/R is not installed
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def infer_cnv(
    data_id: str,
    ctx: "ToolContext",
    params: CNVParameters,
) -> CNVResult:
    """Infer copy number variations using the selected method.

    Supports two methods:
    - infercnvpy: Expression-based CNV inference (default, fast)
    - Numbat: Haplotype-aware CNV analysis (requires allele data, more accurate)

    Args:
        data_id: Dataset identifier
        ctx: Tool context for data access and logging
        params: CNV analysis parameters including method selection

    Returns:
        CNVResult containing method-specific CNV analysis results

    Raises:
        ParameterError: If any of ``params.reference_categories`` is absent
            from ``adata.obs[params.reference_key]``, or if ``params.method``
            is neither 'infercnvpy' nor 'numbat'.

    Errors raised by ``ctx.get_adata``, ``validate_obs_column`` and the
    method-specific implementations propagate unchanged.
    """
    # Retrieve the AnnData object via ToolContext
    adata = await ctx.get_adata(data_id)

    # Validate parameters shared by both methods
    validate_obs_column(adata, params.reference_key, "Reference cell type")

    available_categories = set(adata.obs[params.reference_key].unique())
    missing_categories = set(params.reference_categories) - available_categories
    if missing_categories:
        # Sort by string form: a column mixing strings with NaN/numbers would
        # otherwise raise TypeError here and hide the ParameterError below.
        available_listing = sorted(available_categories, key=str)
        raise ParameterError(
            f"Reference categories {missing_categories} not found in "
            f"adata.obs['{params.reference_key}'].\n"
            f"Available categories: {available_listing}"
        )

    # Dispatch to the requested implementation
    if params.method == "infercnvpy":
        return await _infer_cnv_infercnvpy(data_id, adata, params, ctx)
    elif params.method == "numbat":
        # The Numbat implementation is a plain (synchronous) function.
        return _infer_cnv_numbat(data_id, adata, params, ctx)
    else:
        raise ParameterError(
            f"Unknown CNV method: {params.method}. "
            "Available methods: 'infercnvpy', 'numbat'"
        )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def _infer_cnv_infercnvpy(
    data_id: str,
    adata,
    params: CNVParameters,
    ctx: "ToolContext",
) -> CNVResult:
    """Infer copy number variations using infercnvpy.

    Runs ``infercnvpy.tl.infercnv`` on a copy of ``adata``, optionally
    clusters cells on the resulting CNV representation and computes a
    dendrogram, then copies the results back into ``adata`` and summarises
    them.

    Args:
        data_id: Dataset identifier (only used to populate the result)
        adata: AnnData object (already retrieved via ctx.get_adata);
            modified in place (``obsm``, ``obs`` and ``uns`` entries are added)
        params: CNV analysis parameters
        ctx: Tool context, used for the dependency check and for warnings

    Returns:
        CNVResult containing CNV analysis results and statistics

    Raises:
        ProcessingError: If ``adata.var`` has no 'chromosome' column and the
            infercnvpy call fails.

    Any error from the ``require`` dependency check, or from infercnvpy when
    chromosome information is present, propagates unchanged.
    """
    # Check if infercnvpy is available using centralized dependency manager
    require("infercnvpy", ctx, feature="CNV analysis")
    import infercnvpy as cnv

    # Work on a copy so gene filtering does not alter the caller's object
    adata_cnv = adata.copy()

    # Arguments shared by both code paths below
    infercnv_kwargs = {
        "reference_key": params.reference_key,
        "reference_cat": params.reference_categories,
        "window_size": params.window_size,
        "step": params.step,
        "dynamic_threshold": params.dynamic_threshold,
    }

    if "chromosome" not in adata_cnv.var.columns:
        await ctx.warning(
            "No chromosome information found in adata.var. "
            "Attempting to infer from gene names..."
        )
        try:
            cnv.tl.infercnv(adata_cnv, **infercnv_kwargs)
        except Exception as e:
            raise ProcessingError(
                f"CNV inference failed. Gene positions required: {e}"
            ) from e
    else:
        # Gene positions are available; drop excluded chromosomes first
        if params.exclude_chromosomes:
            genes_to_keep = ~adata_cnv.var["chromosome"].isin(
                params.exclude_chromosomes
            )
            adata_cnv = adata_cnv[:, genes_to_keep].copy()

        cnv.tl.infercnv(adata_cnv, **infercnv_kwargs)

    # Optional: cluster cells by CNV pattern (best effort, failure only warns)
    if params.cluster_cells:
        try:
            sc.pp.neighbors(adata_cnv, use_rep="X_cnv", n_neighbors=15)
            sc.tl.leiden(adata_cnv, key_added="cnv_clusters")
        except Exception as e:
            await ctx.warning(f"Failed to cluster cells by CNV: {e}")

    # Optional: dendrogram over the CNV clusters (best effort as well)
    if params.dendrogram and params.cluster_cells:
        try:
            sc.tl.dendrogram(adata_cnv, groupby="cnv_clusters")
        except Exception as e:
            await ctx.warning(f"Failed to compute dendrogram: {e}")

    # Locate the CNV scores produced by infercnvpy
    cnv_score_key = None
    if "X_cnv" in adata_cnv.obsm:
        cnv_score_key = "X_cnv"
    elif "cnv" in adata_cnv.layers:
        cnv_score_key = "cnv"

    # Statistics and write-back only apply when the scores live in obsm
    scores_in_obsm = bool(cnv_score_key) and cnv_score_key in adata_cnv.obsm

    statistics = {}
    if scores_in_obsm:
        statistics.update(_summarize_cnv_matrix(adata_cnv.obsm[cnv_score_key]))

    # Count reference vs non-reference cells
    is_reference = adata_cnv.obs[params.reference_key].isin(
        params.reference_categories
    )
    statistics["n_reference_cells"] = int(is_reference.sum())
    statistics["n_non_reference_cells"] = int((~is_reference).sum())

    # Number of distinct chromosome labels among the analysed genes (0 = unknown)
    if "chromosome" in adata_cnv.var.columns:
        n_chromosomes = len(adata_cnv.var["chromosome"].unique())
    else:
        n_chromosomes = 0

    n_genes_analyzed = adata_cnv.n_vars

    # Copy results from the working copy back into the original adata
    if scores_in_obsm:
        adata.obsm[cnv_score_key] = adata_cnv.obsm[cnv_score_key]

    # CNV metadata (required for infercnvpy plotting functions)
    if "cnv" in adata_cnv.uns:
        adata.uns["cnv"] = adata_cnv.uns["cnv"]

    if params.cluster_cells and "cnv_clusters" in adata_cnv.obs:
        adata.obs["cnv_clusters"] = adata_cnv.obs["cnv_clusters"]

    if params.dendrogram and "dendrogram_cnv_clusters" in adata_cnv.uns:
        adata.uns["dendrogram_cnv_clusters"] = adata_cnv.uns["dendrogram_cnv_clusters"]

    # Record the analysis parameters; "method" mirrors what the Numbat path
    # stores under the same uns key.
    adata.uns["cnv_analysis"] = {
        "method": "infercnvpy",
        "reference_key": params.reference_key,
        "reference_categories": list(params.reference_categories),
        "window_size": params.window_size,
        "step": params.step,
        "cnv_score_key": cnv_score_key,
    }

    return CNVResult(
        data_id=data_id,
        method="infercnvpy",
        reference_key=params.reference_key,
        reference_categories=list(params.reference_categories),
        n_chromosomes=n_chromosomes,
        n_genes_analyzed=n_genes_analyzed,
        cnv_score_key=cnv_score_key,
        statistics=statistics,
        visualization_available=cnv_score_key is not None,
    )


def _summarize_cnv_matrix(cnv_matrix) -> dict:
    """Return global and per-cell summary statistics of a CNV score matrix.

    Args:
        cnv_matrix: 2-D score matrix, either a scipy sparse matrix or a dense
            array (rows are treated as cells when averaging along axis 1).

    Returns:
        Dict with keys ``mean_cnv``, ``std_cnv``, ``median_cnv``,
        ``mean_cell_cnv_score`` and ``max_cell_cnv_score`` (all floats).
        A per-cell score is the mean absolute value of that row.
    """
    import scipy.sparse

    if scipy.sparse.issparse(cnv_matrix):
        # Sparse input: avoid densifying the full 2-D matrix.
        n_total = cnv_matrix.shape[0] * cnv_matrix.shape[1]
        n_implicit_zeros = n_total - cnv_matrix.nnz

        mean_val = float(cnv_matrix.mean())
        mean_sq = float(cnv_matrix.multiply(cnv_matrix).mean())
        # Var = E[X^2] - E[X]^2; clamp at 0 because rounding can push the
        # difference slightly negative, which would make sqrt return NaN.
        std_val = float(np.sqrt(max(mean_sq - mean_val**2, 0.0)))

        if n_implicit_zeros > n_total / 2:
            # More than half of all entries are zero, so the median is 0.
            median_val = 0.0
        else:
            # At least half the entries are stored, so a 1-D array of all
            # values is affordable; add the implicit zeros back so the median
            # is exact rather than a median of the stored values only.
            all_values = np.concatenate(
                [
                    np.asarray(cnv_matrix.data, dtype=float),
                    np.zeros(n_implicit_zeros),
                ]
            )
            median_val = float(np.median(all_values))

        # abs() keeps the sparsity pattern; the row mean is a column matrix
        cell_cnv_scores = np.asarray(abs(cnv_matrix).mean(axis=1)).ravel()
    else:
        # Dense matrix - use standard numpy operations
        mean_val = float(np.mean(cnv_matrix))
        std_val = float(np.std(cnv_matrix))
        median_val = float(np.median(cnv_matrix))
        cell_cnv_scores = np.mean(np.abs(cnv_matrix), axis=1)

    return {
        "mean_cnv": mean_val,
        "std_cnv": std_val,
        "median_cnv": median_val,
        "mean_cell_cnv_score": float(np.mean(cell_cnv_scores)),
        "max_cell_cnv_score": float(np.max(cell_cnv_scores)),
    }
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _infer_cnv_numbat(
    data_id: str,
    adata,
    params: CNVParameters,
    ctx: "ToolContext",
) -> CNVResult:
    """Infer copy number variations using Numbat (haplotype-aware).

    Numbat performs haplotype-aware CNV analysis by integrating allele-specific
    counts with expression data, enabling detection of copy-neutral LOH and
    reconstruction of tumor phylogeny.

    The function pushes the expression matrix, the allele dataframe and the
    Numbat settings into the R global environment through rpy2, runs
    ``run_numbat`` with a temporary output directory, parses the TSV files it
    writes, and stores the results in ``adata`` (``obsm['X_cnv_numbat']``,
    ``obs['numbat_clone']``, ``obs['numbat_p_cnv']``,
    ``obs['numbat_compartment']`` and several ``uns`` entries).

    Args:
        data_id: Dataset identifier (only used to populate the result)
        adata: AnnData object (already retrieved via ctx.get_adata);
            modified in place
        params: CNV analysis parameters
        ctx: Tool context (not used by this implementation)

    Returns:
        CNVResult containing Numbat CNV analysis results

    Raises:
        DependencyError: If rpy2/anndata2ri cannot be imported or the Numbat
            R package cannot be loaded.
        ParameterError: If ``adata.uns['numbat_allele_data_raw']`` is missing
            or lacks required columns, or if no reference cells are selected.
        ProcessingError: If anything fails while running Numbat or while
            reading and storing its output (the original error is chained).
    """
    import os
    import shutil
    import tempfile

    # Lazy import and check for Numbat availability
    # Note: Numbat requires rpy2 + R + Numbat R package - cannot use centralized manager
    try:
        import anndata2ri
        import rpy2.robjects as ro
        from rpy2.rinterface_lib import openrlib
        from rpy2.robjects import conversion, default_converter, numpy2ri, pandas2ri

        # Test if Numbat R package is available
        ro.r("suppressPackageStartupMessages(library(numbat))")
    except ImportError as e:
        raise DependencyError(f"rpy2 not installed: {e}") from e
    except Exception as e:
        raise DependencyError(f"Numbat R package unavailable: {e}") from e

    import pandas as pd

    # Numbat needs the long-format allele dataframe (from pileup_and_phase or
    # similar) to be present in adata.uns.
    if "numbat_allele_data_raw" not in adata.uns:
        raise ParameterError(
            "Numbat requires long-format allele dataframe in adata.uns['numbat_allele_data_raw'].\n"
            "This should be created during data preparation (e.g., from pileup_and_phase).\n"
            "The dataframe should have columns: cell, CHROM, POS, REF, ALT, AD, DP, etc.\n"
            f"Available uns keys: {list(adata.uns.keys())}"
        )

    df_allele = adata.uns["numbat_allele_data_raw"]

    # Validate required columns
    required_cols = ["cell", "CHROM", "POS", "REF", "ALT", "AD", "DP"]
    missing_cols = [col for col in required_cols if col not in df_allele.columns]
    if missing_cols:
        raise ParameterError(
            f"Allele dataframe missing required columns: {missing_cols}\n"
            f"Available columns: {list(df_allele.columns)}\n"
            "Numbat requires: cell, CHROM, POS, REF, ALT, AD (alt count), "
            "DP (total depth)"
        )

    # Expression matrix and its axis labels
    count_mat = adata.X
    gene_names = list(adata.var_names)
    cell_barcodes = list(adata.obs_names)

    # Positions of reference cells, shifted by one because R is 1-indexed
    ref_mask = adata.obs[params.reference_key].isin(params.reference_categories)
    ref_indices_r = [i + 1 for i, is_ref in enumerate(ref_mask) if is_ref]

    if not ref_indices_r:
        raise ParameterError(
            f"No reference cells found with key '{params.reference_key}' and "
            f"categories {params.reference_categories}"
        )

    # Temporary directory that receives Numbat's output files
    out_dir = tempfile.mkdtemp(prefix="numbat_")

    try:
        # All R interaction happens under the R lock and a local converter.
        # This prevents "Conversion rules missing" errors in multithreaded/async environments
        with openrlib.rlock:
            with conversion.localconverter(
                default_converter
                + anndata2ri.converter
                + pandas2ri.converter
                + numpy2ri.converter
            ):
                # Transfer data to R environment (inside context!)
                ro.globalenv["count_mat"] = count_mat.T  # R expects genes × cells
                ro.globalenv["df_allele_python"] = df_allele
                ro.globalenv["gene_names"] = gene_names
                ro.globalenv["cell_barcodes"] = cell_barcodes
                ro.globalenv["ref_indices"] = ref_indices_r
                ro.globalenv["out_dir"] = out_dir

                # Set Numbat parameters (inside context!)
                ro.globalenv["genome"] = params.numbat_genome
                ro.globalenv["t_param"] = params.numbat_t
                ro.globalenv["max_entropy"] = params.numbat_max_entropy
                ro.globalenv["min_cells"] = params.numbat_min_cells
                ro.globalenv["ncores"] = params.numbat_ncores
                ro.globalenv["skip_nj"] = params.numbat_skip_nj

                # NOTE(review): cell_annot labels every cell ("normal" or
                # "tumor"), so aggregate_counts presumably builds a profile
                # for both groups - confirm this is what run_numbat should
                # receive as its reference profile.
                # Run Numbat via R (inside context!)
                ro.r(
                    """
                    library(numbat)
                    library(dplyr)

                    # Keep count matrix in dgCMatrix/matrix format (do NOT convert to dataframe!)
                    # run_numbat requires dgCMatrix or matrix, not data.frame
                    # Ensure proper row/column names are set
                    rownames(count_mat) = gene_names
                    colnames(count_mat) = cell_barcodes

                    # Use allele dataframe from Python (already in correct format)
                    df_allele = df_allele_python

                    # Create cell annotation for reference cells
                    # Convert cell_barcodes to character vector (rpy2 may pass it as list)
                    cell_vec = as.character(unlist(cell_barcodes))
                    cell_annot = data.frame(
                        cell = cell_vec,
                        group = ifelse(1:length(cell_vec) %in% ref_indices, "normal", "tumor"),
                        stringsAsFactors = FALSE
                    )

                    # Aggregate reference expression profile from count matrix
                    ref_profile = aggregate_counts(count_mat, cell_annot, verbose = FALSE)

                    # Run Numbat with reference profile
                    # Note: run_numbat returns "Success" string, not results object!
                    # Results are saved to out_dir as TSV/RDS files
                    tryCatch({
                        result_status = run_numbat(
                            count_mat,           # gene x cell count matrix (dgCMatrix or matrix)
                            ref_profile,         # reference expression profile (lambdas_ref)
                            df_allele,           # allele dataframe
                            genome = genome,
                            t = t_param,
                            max_entropy = max_entropy,
                            min_cells = min_cells,
                            ncores = ncores,
                            skip_nj = skip_nj,
                            plot = FALSE,
                            out_dir = out_dir,   # Output directory for results
                            verbose = FALSE
                        )
                    }, error = function(e) {
                        stop(paste("Numbat execution failed:", e$message))
                    })
                    """
                )

        # Read results from output files (Numbat saves to TSV files, not R objects)

        # 1. Clone posteriors (cell-level assignments) - required
        clone_post_file = os.path.join(out_dir, "clone_post_2.tsv")
        if not os.path.exists(clone_post_file):
            raise DataNotFoundError(
                f"Numbat output file not found: {clone_post_file}\n"
                f"Expected output files in: {out_dir}"
            )
        clone_post = pd.read_csv(clone_post_file, sep="\t")

        # 2. Genotype matrix (CNV states per segment) - required
        geno_file = os.path.join(out_dir, "geno_2.tsv")
        if not os.path.exists(geno_file):
            raise DataNotFoundError(
                f"Numbat output file not found: {geno_file}\n"
                f"Expected output files in: {out_dir}"
            )
        geno = pd.read_csv(geno_file, sep="\t")

        # 3. Consensus segments - optional metadata
        segs_file = os.path.join(out_dir, "segs_consensus_2.tsv")
        segs = pd.read_csv(segs_file, sep="\t") if os.path.exists(segs_file) else None

        # 4. Phylogeny tree (if skip_nj=FALSE) - only its presence is checked
        tree_file = os.path.join(out_dir, "tree_final_2.rds")
        has_phylo = os.path.exists(tree_file)

        # geno has structure: cell | segment1 | segment2 | ...
        geno_cells = geno["cell"].values
        geno_segments = geno.drop(columns=["cell"]).values

        # Map each genotype row to its row position in adata (-1 = unknown cell)
        cell_order = {cell: i for i, cell in enumerate(cell_barcodes)}
        geno_sorted_indices = [cell_order.get(cell, -1) for cell in geno_cells]

        if -1 in geno_sorted_indices:
            raise DataCompatibilityError(
                "Mismatch between genotype cells and AnnData cells"
            )

        # Reorder genotype rows to AnnData cell order (cells × segments);
        # cells absent from the genotype file keep an all-zero row.
        cnv_matrix = np.zeros((len(cell_barcodes), geno_segments.shape[1]))
        row_positions = np.asarray(geno_sorted_indices, dtype=np.intp)
        cnv_matrix[row_positions, :] = geno_segments

        # Store results in AnnData
        adata.obsm["X_cnv_numbat"] = cnv_matrix

        # Per-column lookup {column: {cell: value}} for clone assignments
        clone_dict = clone_post.set_index("cell").to_dict()

        # Convert numpy types to Python native types for H5AD compatibility
        adata.obs["numbat_clone"] = [
            str(clone_dict["clone_opt"].get(cell, "unknown")) for cell in cell_barcodes
        ]
        adata.obs["numbat_p_cnv"] = [
            float(clone_dict["p_cnv"].get(cell, 0.0)) for cell in cell_barcodes
        ]
        adata.obs["numbat_compartment"] = [
            str(clone_dict["compartment_opt"].get(cell, "unknown"))
            for cell in cell_barcodes
        ]

        # Store segment information if available
        if segs is not None:
            # Object columns with NaN values cause H5AD serialization errors,
            # so fill NaN in object columns with an empty string.
            segs_clean = segs.copy()
            for col in segs_clean.columns:
                if segs_clean[col].dtype == "object":
                    segs_clean[col] = segs_clean[col].fillna("")
            adata.uns["numbat_segments"] = segs_clean

        if has_phylo:
            # NOTE(review): tree_file lives inside out_dir, which is removed in
            # the finally block below, so this stored path no longer exists
            # once the function returns.
            adata.uns["numbat_phylogeny"] = {
                "available": True,
                "tree_file": tree_file,
                "tree_type": "phylo",
            }

        # Summary statistics
        clone_counts = clone_post["clone_opt"].value_counts()
        statistics = {
            "mean_cnv": float(np.mean(cnv_matrix)),
            "std_cnv": float(np.std(cnv_matrix)),
            "median_cnv": float(np.median(cnv_matrix)),
            "n_clones": int(clone_post["clone_opt"].nunique()),
            "mean_p_cnv": float(clone_post["p_cnv"].mean()),
            "n_reference_cells": len(ref_indices_r),
            "n_non_reference_cells": len(cell_barcodes) - len(ref_indices_r),
            "n_segments": geno_segments.shape[1],
            "clone_distribution": {
                str(clone): int(count) for clone, count in clone_counts.items()
            },
        }

        # Store analysis parameters
        adata.uns["cnv_analysis"] = {
            "method": "numbat",
            "reference_key": params.reference_key,
            "reference_categories": list(params.reference_categories),
            "genome": params.numbat_genome,
            "t": params.numbat_t,
            "max_entropy": params.numbat_max_entropy,
            "min_cells": params.numbat_min_cells,
            "cnv_score_key": "X_cnv_numbat",
        }

    except Exception as e:
        raise ProcessingError(
            f"Numbat analysis failed: {e}\n"
            "Common issues:\n"
            "  - Allele data format incompatible\n"
            "  - Missing genomic position information\n"
            "  - Insufficient reference cells\n"
            "  - R environment configuration issues"
        ) from e
    finally:
        # Best-effort removal of the temporary output directory; a cleanup
        # failure is not critical. The converters above were scoped with
        # localconverter and never activated globally, so there is nothing
        # to deactivate here.
        shutil.rmtree(out_dir, ignore_errors=True)

    return CNVResult(
        data_id=data_id,
        method="numbat",
        reference_key=params.reference_key,
        reference_categories=list(params.reference_categories),
        n_chromosomes=0,  # not derived from the Numbat output here
        n_genes_analyzed=len(gene_names),
        cnv_score_key="X_cnv_numbat",
        statistics=statistics,
        visualization_available=True,
    )
|