chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,605 @@
1
+ """
2
+ Copy Number Variation (CNV) analysis tools for spatial transcriptomics data.
3
+ """
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+ import scanpy as sc
9
+
10
+ if TYPE_CHECKING:
11
+ from ..spatial_mcp_adapter import ToolContext
12
+
13
+ from ..models.analysis import CNVResult
14
+ from ..models.data import CNVParameters
15
+ from ..utils import validate_obs_column
16
+ from ..utils.dependency_manager import require
17
+ from ..utils.exceptions import (
18
+ DataCompatibilityError,
19
+ DataNotFoundError,
20
+ DependencyError,
21
+ ParameterError,
22
+ ProcessingError,
23
+ )
24
+
25
+ # Numbat availability is checked lazily in _infer_cnv_numbat to avoid
26
+ # import-time failures when rpy2/R is not installed
27
+
28
+
29
async def infer_cnv(
    data_id: str,
    ctx: "ToolContext",
    params: CNVParameters,
) -> CNVResult:
    """Infer copy number variations using the selected method.

    Supports two methods:
    - infercnvpy: Expression-based CNV inference (default, fast)
    - Numbat: Haplotype-aware CNV analysis (requires allele data, more accurate)

    Args:
        data_id: Dataset identifier
        ctx: Tool context for data access and logging
        params: CNV analysis parameters including method selection

    Returns:
        CNVResult containing method-specific CNV analysis results

    Raises:
        ParameterError: If any of ``params.reference_categories`` is absent
            from ``adata.obs[params.reference_key]``, or if ``params.method``
            is neither 'infercnvpy' nor 'numbat'.

    Note:
        ``validate_obs_column`` and the method-specific helpers may raise
        further errors of their own (presumably from ``..utils.exceptions``
        -- confirm against those helpers).
    """
    # Retrieve the AnnData object via ToolContext
    adata = await ctx.get_adata(data_id)

    # Validate common parameters
    validate_obs_column(adata, params.reference_key, "Reference cell type")

    available_categories = set(adata.obs[params.reference_key].unique())
    missing_categories = set(params.reference_categories) - available_categories
    if missing_categories:
        # Sort by string form: the column may mix types (e.g. str labels and
        # NaN), and a plain sorted() would then raise TypeError and hide the
        # ParameterError we actually want to report.
        available_listing = sorted(available_categories, key=str)
        raise ParameterError(
            f"Reference categories {missing_categories} not found in "
            f"adata.obs['{params.reference_key}'].\n"
            f"Available categories: {available_listing}"
        )

    # Dispatch to appropriate method
    if params.method == "infercnvpy":
        return await _infer_cnv_infercnvpy(data_id, adata, params, ctx)
    elif params.method == "numbat":
        # The Numbat helper is a plain (synchronous) function
        return _infer_cnv_numbat(data_id, adata, params, ctx)
    else:
        raise ParameterError(
            f"Unknown CNV method: {params.method}. "
            "Available methods: 'infercnvpy', 'numbat'"
        )
77
+
78
+
79
def _sparse_cnv_median(cnv_matrix) -> float:
    """Return the exact median of a scipy sparse matrix without densifying it.

    The full value multiset is the stored entries plus the implicit zeros.
    Sorted, it reads: stored negatives, then all zeros (explicit and
    implicit), then stored positives. The middle element(s) can therefore be
    located from the sorted stored entries alone.
    """
    n_total = cnv_matrix.shape[0] * cnv_matrix.shape[1]
    if n_total == 0:
        return float("nan")

    stored = np.asarray(cnv_matrix.data)
    n_implicit_zeros = n_total - stored.size

    # Fast path: when zeros are the strict majority the median is exactly 0
    if n_implicit_zeros > n_total / 2:
        return 0.0

    stored = np.sort(stored)
    n_negative = int(np.searchsorted(stored, 0, side="left"))
    n_stored_non_positive = int(np.searchsorted(stored, 0, side="right"))
    zero_block_end = n_stored_non_positive + n_implicit_zeros

    def value_at(rank: int) -> float:
        # rank is a 0-based position in the virtual, fully sorted value list
        if rank < n_negative:
            return float(stored[rank])
        if rank < zero_block_end:
            return 0.0
        return float(stored[rank - n_implicit_zeros])

    middle = n_total // 2
    if n_total % 2:
        return value_at(middle)
    return 0.5 * (value_at(middle - 1) + value_at(middle))


def _summarize_cnv_matrix(cnv_matrix) -> dict:
    """Compute global and per-cell summary statistics of a CNV score matrix.

    Accepts either a scipy sparse matrix or a dense array (rows are treated
    as cells). Sparse input is never converted to dense.

    Returns:
        Dict with keys ``mean_cnv``, ``std_cnv``, ``median_cnv``,
        ``mean_cell_cnv_score`` and ``max_cell_cnv_score``.
    """
    import scipy.sparse

    if scipy.sparse.issparse(cnv_matrix):
        mean_val = float(cnv_matrix.mean())

        # Std via E[X^2] - E[X]^2; clamp at zero because rounding can make
        # the difference slightly negative, which would yield NaN.
        mean_sq = float(cnv_matrix.multiply(cnv_matrix).mean())
        std_val = float(np.sqrt(max(mean_sq - mean_val**2, 0.0)))

        median_val = _sparse_cnv_median(cnv_matrix)

        # abs() on the stored entries preserves sparsity
        cnv_abs = cnv_matrix.copy()
        cnv_abs.data = np.abs(cnv_abs.data)
        cell_cnv_scores = np.asarray(cnv_abs.mean(axis=1)).ravel()
    else:
        mean_val = float(np.mean(cnv_matrix))
        std_val = float(np.std(cnv_matrix))
        median_val = float(np.median(cnv_matrix))
        cell_cnv_scores = np.mean(np.abs(cnv_matrix), axis=1)

    return {
        "mean_cnv": mean_val,
        "std_cnv": std_val,
        "median_cnv": median_val,
        "mean_cell_cnv_score": float(np.mean(cell_cnv_scores)),
        "max_cell_cnv_score": float(np.max(cell_cnv_scores)),
    }


async def _infer_cnv_infercnvpy(
    data_id: str,
    adata,
    params: CNVParameters,
    ctx: "ToolContext",
) -> CNVResult:
    """Infer copy number variations using infercnvpy.

    Runs ``infercnvpy.tl.infercnv`` on a copy of ``adata``, optionally
    clusters cells on the resulting CNV representation and computes a
    dendrogram, then copies the results back onto ``adata``:

    - ``adata.obsm["X_cnv"]`` (when produced) and ``adata.uns["cnv"]``
    - ``adata.obs["cnv_clusters"]`` / ``adata.uns["dendrogram_cnv_clusters"]``
      when clustering / dendrogram were requested and succeeded
    - ``adata.uns["cnv_analysis"]`` with the parameters used

    Args:
        data_id: Dataset identifier (for result creation)
        adata: AnnData object (already retrieved via ctx.get_adata)
        params: CNV analysis parameters
        ctx: Tool context for logging

    Returns:
        CNVResult containing CNV analysis results and statistics

    Raises:
        ProcessingError: If ``adata.var`` has no "chromosome" column and the
            inference attempt fails.

    Note:
        ``require`` presumably raises when infercnvpy is not installed --
        confirm against ``..utils.dependency_manager``.
    """
    # Check if infercnvpy is available using centralized dependency manager
    require("infercnvpy", ctx, feature="CNV analysis")
    import infercnvpy as cnv

    # Note: adata is already validated in infer_cnv() before dispatch.
    # Work on a copy so gene filtering does not touch the caller's object.
    adata_cnv = adata.copy()

    infercnv_kwargs = {
        "reference_key": params.reference_key,
        "reference_cat": params.reference_categories,
        "window_size": params.window_size,
        "step": params.step,
        "dynamic_threshold": params.dynamic_threshold,
    }

    if "chromosome" in adata_cnv.var.columns:
        # Gene positions are available; drop excluded chromosomes first
        if params.exclude_chromosomes:
            genes_to_keep = ~adata_cnv.var["chromosome"].isin(
                params.exclude_chromosomes
            )
            adata_cnv = adata_cnv[:, genes_to_keep].copy()

        cnv.tl.infercnv(adata_cnv, **infercnv_kwargs)
    else:
        await ctx.warning(
            "No chromosome information found in adata.var. "
            "Attempting to infer from gene names..."
        )
        try:
            # Attempt the run anyway; any failure is reported as a
            # ProcessingError pointing at the missing gene positions.
            cnv.tl.infercnv(adata_cnv, **infercnv_kwargs)
        except Exception as e:
            raise ProcessingError(
                f"CNV inference failed. Gene positions required: {e}"
            ) from e

    # Optional: Cluster cells by CNV pattern (best effort)
    if params.cluster_cells:
        try:
            sc.pp.neighbors(adata_cnv, use_rep="X_cnv", n_neighbors=15)
            sc.tl.leiden(adata_cnv, key_added="cnv_clusters")
        except Exception as e:
            await ctx.warning(f"Failed to cluster cells by CNV: {e}")

    # Optional: Compute dendrogram. Skipped when clustering produced no
    # "cnv_clusters" column, since the dendrogram groups by that column.
    if (
        params.dendrogram
        and params.cluster_cells
        and "cnv_clusters" in adata_cnv.obs
    ):
        try:
            sc.tl.dendrogram(adata_cnv, groupby="cnv_clusters")
        except Exception as e:
            await ctx.warning(f"Failed to compute dendrogram: {e}")

    # Locate the CNV scores produced by the run
    cnv_score_key = None
    if "X_cnv" in adata_cnv.obsm:
        cnv_score_key = "X_cnv"
    elif "cnv" in adata_cnv.layers:
        cnv_score_key = "cnv"

    scores_in_obsm = cnv_score_key is not None and cnv_score_key in adata_cnv.obsm

    # Calculate statistics (only when the scores live in obsm)
    statistics = {}
    if scores_in_obsm:
        statistics.update(_summarize_cnv_matrix(adata_cnv.obsm[cnv_score_key]))

    # Count reference vs non-reference cells
    is_reference = adata_cnv.obs[params.reference_key].isin(
        params.reference_categories
    )
    statistics["n_reference_cells"] = int(is_reference.sum())
    statistics["n_non_reference_cells"] = int((~is_reference).sum())

    # Get chromosome information (0 means unknown)
    if "chromosome" in adata_cnv.var.columns:
        n_chromosomes = len(adata_cnv.var["chromosome"].unique())
    else:
        n_chromosomes = 0

    n_genes_analyzed = adata_cnv.n_vars

    # Store CNV results back in the original adata object
    if scores_in_obsm:
        adata.obsm[cnv_score_key] = adata_cnv.obsm[cnv_score_key]

    # Store CNV metadata (required for infercnvpy plotting functions)
    if "cnv" in adata_cnv.uns:
        adata.uns["cnv"] = adata_cnv.uns["cnv"]

    if params.cluster_cells and "cnv_clusters" in adata_cnv.obs:
        adata.obs["cnv_clusters"] = adata_cnv.obs["cnv_clusters"]

    if params.dendrogram and "dendrogram_cnv_clusters" in adata_cnv.uns:
        adata.uns["dendrogram_cnv_clusters"] = adata_cnv.uns["dendrogram_cnv_clusters"]

    # Store CNV analysis parameters in adata.uns for reference.
    # "method" mirrors the record written by the Numbat code path.
    adata.uns["cnv_analysis"] = {
        "method": "infercnvpy",
        "reference_key": params.reference_key,
        "reference_categories": list(params.reference_categories),
        "window_size": params.window_size,
        "step": params.step,
        "cnv_score_key": cnv_score_key,
    }

    return CNVResult(
        data_id=data_id,
        method="infercnvpy",
        reference_key=params.reference_key,
        reference_categories=list(params.reference_categories),
        n_chromosomes=n_chromosomes,
        n_genes_analyzed=n_genes_analyzed,
        cnv_score_key=cnv_score_key,
        statistics=statistics,
        visualization_available=cnv_score_key is not None,
    )
274
+
275
+
276
def _infer_cnv_numbat(
    data_id: str,
    adata,
    params: CNVParameters,
    ctx: "ToolContext",
) -> CNVResult:
    """Infer copy number variations using Numbat (haplotype-aware).

    Numbat performs haplotype-aware CNV analysis by integrating allele-specific
    counts with expression data, enabling detection of copy-neutral LOH and
    reconstruction of tumor phylogeny.

    The function pushes the count matrix, the allele dataframe and the run
    parameters into the R global environment, calls ``run_numbat`` with a
    temporary output directory, and reads the ``*_2`` result files back
    (hard-coded suffix -- presumably the final iteration under Numbat's
    default settings; confirm). Results are written onto ``adata``:

    - ``adata.obsm["X_cnv_numbat"]``: cells x segments genotype matrix
    - ``adata.obs["numbat_clone" | "numbat_p_cnv" | "numbat_compartment"]``
    - ``adata.uns["numbat_segments"]`` / ``adata.uns["numbat_phylogeny"]``
      when the corresponding output files exist
    - ``adata.uns["cnv_analysis"]`` with the parameters used

    Args:
        data_id: Dataset identifier (for result creation)
        adata: AnnData object (already retrieved via ctx.get_adata)
        params: CNV analysis parameters
        ctx: Tool context for logging

    Returns:
        CNVResult containing Numbat CNV analysis results

    Raises:
        DependencyError: If rpy2/anndata2ri cannot be imported or the Numbat
            R package cannot be loaded.
        ParameterError: If ``adata.uns['numbat_allele_data_raw']`` is missing
            or lacks required columns, or no reference cells are selected.
        ProcessingError: If anything fails while running Numbat or reading
            its output (this includes missing output files and cell
            mismatches, which are wrapped by the enclosing handler).
    """
    # Lazy import and check for Numbat availability
    # Note: Numbat requires rpy2 + R + Numbat R package - cannot use centralized manager
    try:
        import anndata2ri
        import rpy2.robjects as ro
        from rpy2.rinterface_lib import openrlib
        from rpy2.robjects import conversion, default_converter, numpy2ri, pandas2ri

        # Test if Numbat R package is available
        ro.r("suppressPackageStartupMessages(library(numbat))")
    except ImportError as e:
        raise DependencyError(f"rpy2 not installed: {e}") from e
    except Exception as e:
        raise DependencyError(f"Numbat R package unavailable: {e}") from e

    # Note: adata is already retrieved in infer_cnv() before dispatch

    # Numbat requires a long-format allele dataframe (from pileup_and_phase
    # or similar), expected under adata.uns["numbat_allele_data_raw"].
    if "numbat_allele_data_raw" not in adata.uns:
        raise ParameterError(
            "Numbat requires long-format allele dataframe in adata.uns['numbat_allele_data_raw'].\n"
            "This should be created during data preparation (e.g., from pileup_and_phase).\n"
            "The dataframe should have columns: cell, CHROM, POS, REF, ALT, AD, DP, etc.\n"
            f"Available uns keys: {list(adata.uns.keys())}"
        )

    df_allele = adata.uns["numbat_allele_data_raw"]

    # Validate required columns
    required_cols = ["cell", "CHROM", "POS", "REF", "ALT", "AD", "DP"]
    missing_cols = [col for col in required_cols if col not in df_allele.columns]
    if missing_cols:
        raise ParameterError(
            f"Allele dataframe missing required columns: {missing_cols}\n"
            f"Available columns: {list(df_allele.columns)}\n"
            "Numbat requires: cell, CHROM, POS, REF, ALT, AD (alt count), "
            "DP (total depth)"
        )

    # Get expression matrix
    count_mat = adata.X

    # Prepare metadata
    gene_names = list(adata.var_names)
    cell_barcodes = list(adata.obs_names)

    # Identify reference cells (R is 1-indexed, hence the +1)
    ref_mask = adata.obs[params.reference_key].isin(params.reference_categories)
    ref_indices_r = [i + 1 for i, is_ref in enumerate(ref_mask) if is_ref]

    if not ref_indices_r:
        raise ParameterError(
            f"No reference cells found with key '{params.reference_key}' and "
            f"categories {params.reference_categories}"
        )

    # Create temporary directory for Numbat output
    import os
    import shutil
    import tempfile

    out_dir = tempfile.mkdtemp(prefix="numbat_", dir=tempfile.gettempdir())

    try:
        # Use sparkx-style context management for ALL R operations
        # This prevents "Conversion rules missing" errors in multithreaded/async environments
        with openrlib.rlock:  # Thread safety lock
            with conversion.localconverter(
                default_converter
                + anndata2ri.converter
                + pandas2ri.converter
                + numpy2ri.converter
            ):
                # Transfer data to R environment (inside context!)
                ro.globalenv["count_mat"] = count_mat.T  # R expects genes × cells
                ro.globalenv["df_allele_python"] = df_allele
                ro.globalenv["gene_names"] = gene_names
                ro.globalenv["cell_barcodes"] = cell_barcodes
                ro.globalenv["ref_indices"] = ref_indices_r
                ro.globalenv["out_dir"] = out_dir  # Output directory

                # Set Numbat parameters (inside context!)
                ro.globalenv["genome"] = params.numbat_genome
                ro.globalenv["t_param"] = params.numbat_t
                ro.globalenv["max_entropy"] = params.numbat_max_entropy
                ro.globalenv["min_cells"] = params.numbat_min_cells
                ro.globalenv["ncores"] = params.numbat_ncores
                ro.globalenv["skip_nj"] = params.numbat_skip_nj

                # Run Numbat via R (inside context!)
                ro.r(
                    """
                    library(numbat)
                    library(dplyr)

                    # Keep count matrix in dgCMatrix/matrix format (do NOT convert to dataframe!)
                    # run_numbat requires dgCMatrix or matrix, not data.frame
                    # Ensure proper row/column names are set
                    rownames(count_mat) = gene_names
                    colnames(count_mat) = cell_barcodes

                    # Use allele dataframe from Python (already in correct format)
                    df_allele = df_allele_python

                    # Create cell annotation for reference cells
                    # Convert cell_barcodes to character vector (rpy2 may pass it as list)
                    cell_vec = as.character(unlist(cell_barcodes))
                    cell_annot = data.frame(
                        cell = cell_vec,
                        group = ifelse(1:length(cell_vec) %in% ref_indices, "normal", "tumor"),
                        stringsAsFactors = FALSE
                    )

                    # Aggregate reference expression profile from count matrix
                    ref_profile = aggregate_counts(count_mat, cell_annot, verbose = FALSE)

                    # Run Numbat with reference profile
                    # Note: run_numbat returns "Success" string, not results object!
                    # Results are saved to out_dir as TSV/RDS files
                    tryCatch({
                        result_status = run_numbat(
                            count_mat,  # gene x cell count matrix (dgCMatrix or matrix)
                            ref_profile,  # reference expression profile (lambdas_ref)
                            df_allele,  # allele dataframe
                            genome = genome,
                            t = t_param,
                            max_entropy = max_entropy,
                            min_cells = min_cells,
                            ncores = ncores,
                            skip_nj = skip_nj,
                            plot = FALSE,
                            out_dir = out_dir,  # Output directory for results
                            verbose = FALSE
                        )
                    }, error = function(e) {
                        stop(paste("Numbat execution failed:", e$message))
                    })
                    """
                )

        # Read results from output files (Numbat saves to TSV files, not R objects)
        import pandas as pd

        # 1. Read clone posteriors (cell-level assignments)
        clone_post_file = os.path.join(out_dir, "clone_post_2.tsv")
        if not os.path.exists(clone_post_file):
            raise DataNotFoundError(
                f"Numbat output file not found: {clone_post_file}\n"
                f"Expected output files in: {out_dir}"
            )
        clone_post = pd.read_csv(clone_post_file, sep="\t")

        # 2. Read genotype matrix (CNV states per segment)
        geno_file = os.path.join(out_dir, "geno_2.tsv")
        if not os.path.exists(geno_file):
            raise DataNotFoundError(
                f"Numbat output file not found: {geno_file}\n"
                f"Expected output files in: {out_dir}"
            )
        geno = pd.read_csv(geno_file, sep="\t")

        # 3. Read consensus segments (optional metadata)
        segs_file = os.path.join(out_dir, "segs_consensus_2.tsv")
        segs = pd.read_csv(segs_file, sep="\t") if os.path.exists(segs_file) else None

        # 4. Check for phylogeny tree (if skip_nj=FALSE)
        tree_file = os.path.join(out_dir, "tree_final_2.rds")
        has_phylo = os.path.exists(tree_file)

        # Process genotype matrix for AnnData storage
        # geno has structure: cell | segment1 | segment2 | ...
        geno_cells = geno["cell"].values
        geno_segments = geno.drop(columns=["cell"]).values

        # Map each genotype row to its row position in adata.obs_names
        cell_order = {cell: i for i, cell in enumerate(cell_barcodes)}
        geno_sorted_indices = [cell_order.get(cell, -1) for cell in geno_cells]

        if -1 in geno_sorted_indices:
            raise DataCompatibilityError(
                "Mismatch between genotype cells and AnnData cells"
            )

        # Scatter genotype rows into AnnData cell order in one assignment;
        # cells absent from the genotype table keep all-zero rows.
        cnv_matrix = np.zeros((len(cell_barcodes), geno_segments.shape[1]))
        row_positions = np.asarray(geno_sorted_indices, dtype=np.intp)
        cnv_matrix[row_positions, :] = geno_segments

        # Store results in AnnData
        adata.obsm["X_cnv_numbat"] = cnv_matrix

        # Extract clone assignments and probabilities
        # Match clone_post cells with adata.obs_names
        clone_dict = clone_post.set_index("cell").to_dict()

        # Convert numpy types to Python native types for H5AD compatibility
        adata.obs["numbat_clone"] = [
            str(clone_dict["clone_opt"].get(cell, "unknown")) for cell in cell_barcodes
        ]
        adata.obs["numbat_p_cnv"] = [
            float(clone_dict["p_cnv"].get(cell, 0.0)) for cell in cell_barcodes
        ]
        adata.obs["numbat_compartment"] = [
            str(clone_dict["compartment_opt"].get(cell, "unknown"))
            for cell in cell_barcodes
        ]

        # Store segment information if available
        if segs is not None:
            # H5AD natively supports DataFrame storage in uns
            # However, object columns with NaN values cause serialization errors
            # Fill NaN in object columns with empty string for H5AD compatibility
            segs_clean = segs.copy()
            for col in segs_clean.columns:
                if segs_clean[col].dtype == "object":
                    segs_clean[col] = segs_clean[col].fillna("")
            adata.uns["numbat_segments"] = segs_clean

        if has_phylo:
            # Store phylogeny metadata.
            # NOTE(review): tree_file lives inside out_dir, which is removed
            # in the finally block below, so the stored path no longer exists
            # once this function returns. Persist the file elsewhere if
            # downstream code needs to load the tree.
            adata.uns["numbat_phylogeny"] = {
                "available": True,
                "tree_file": tree_file,
                "tree_type": "phylo",
            }

        # Calculate statistics
        statistics = {
            "mean_cnv": float(np.mean(cnv_matrix)),
            "std_cnv": float(np.std(cnv_matrix)),
            "median_cnv": float(np.median(cnv_matrix)),
            "n_clones": int(clone_post["clone_opt"].nunique()),
            "mean_p_cnv": float(clone_post["p_cnv"].mean()),
            "n_reference_cells": len(ref_indices_r),
            "n_non_reference_cells": len(cell_barcodes) - len(ref_indices_r),
            "n_segments": geno_segments.shape[1],
        }

        # Get clone distribution
        clone_counts = clone_post["clone_opt"].value_counts()
        # Type: ignore needed because mypy doesn't infer Dict[str, Any] correctly
        statistics["clone_distribution"] = {  # type: ignore[assignment]
            str(clone): int(count) for clone, count in clone_counts.items()
        }

        # Store analysis parameters
        adata.uns["cnv_analysis"] = {
            "method": "numbat",
            "reference_key": params.reference_key,
            "reference_categories": list(params.reference_categories),
            "genome": params.numbat_genome,
            "t": params.numbat_t,
            "max_entropy": params.numbat_max_entropy,
            "min_cells": params.numbat_min_cells,
            "cnv_score_key": "X_cnv_numbat",
        }

    except Exception as e:
        raise ProcessingError(
            f"Numbat analysis failed: {e}\n"
            "Common issues:\n"
            "  - Allele data format incompatible\n"
            "  - Missing genomic position information\n"
            "  - Insufficient reference cells\n"
            "  - R environment configuration issues"
        ) from e
    finally:
        # Cleanup: remove the temporary output directory (best effort).
        # No converter deactivation is needed here: conversion rules were
        # only installed through the scoped localconverter above, and global
        # deactivate() calls could raise and mask the real error.
        shutil.rmtree(out_dir, ignore_errors=True)

    return CNVResult(
        data_id=data_id,
        method="numbat",
        reference_key=params.reference_key,
        reference_categories=list(params.reference_categories),
        n_chromosomes=0,  # Numbat doesn't report this directly
        n_genes_analyzed=len(gene_names),
        cnv_score_key="X_cnv_numbat",
        statistics=statistics,
        visualization_available=True,
    )