chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. chatspatial/__init__.py +11 -0
  2. chatspatial/__main__.py +141 -0
  3. chatspatial/cli/__init__.py +7 -0
  4. chatspatial/config.py +53 -0
  5. chatspatial/models/__init__.py +85 -0
  6. chatspatial/models/analysis.py +513 -0
  7. chatspatial/models/data.py +2462 -0
  8. chatspatial/server.py +1763 -0
  9. chatspatial/spatial_mcp_adapter.py +720 -0
  10. chatspatial/tools/__init__.py +3 -0
  11. chatspatial/tools/annotation.py +1903 -0
  12. chatspatial/tools/cell_communication.py +1603 -0
  13. chatspatial/tools/cnv_analysis.py +605 -0
  14. chatspatial/tools/condition_comparison.py +595 -0
  15. chatspatial/tools/deconvolution/__init__.py +402 -0
  16. chatspatial/tools/deconvolution/base.py +318 -0
  17. chatspatial/tools/deconvolution/card.py +244 -0
  18. chatspatial/tools/deconvolution/cell2location.py +326 -0
  19. chatspatial/tools/deconvolution/destvi.py +144 -0
  20. chatspatial/tools/deconvolution/flashdeconv.py +101 -0
  21. chatspatial/tools/deconvolution/rctd.py +317 -0
  22. chatspatial/tools/deconvolution/spotlight.py +216 -0
  23. chatspatial/tools/deconvolution/stereoscope.py +109 -0
  24. chatspatial/tools/deconvolution/tangram.py +135 -0
  25. chatspatial/tools/differential.py +625 -0
  26. chatspatial/tools/embeddings.py +298 -0
  27. chatspatial/tools/enrichment.py +1863 -0
  28. chatspatial/tools/integration.py +807 -0
  29. chatspatial/tools/preprocessing.py +723 -0
  30. chatspatial/tools/spatial_domains.py +808 -0
  31. chatspatial/tools/spatial_genes.py +836 -0
  32. chatspatial/tools/spatial_registration.py +441 -0
  33. chatspatial/tools/spatial_statistics.py +1476 -0
  34. chatspatial/tools/trajectory.py +495 -0
  35. chatspatial/tools/velocity.py +405 -0
  36. chatspatial/tools/visualization/__init__.py +155 -0
  37. chatspatial/tools/visualization/basic.py +393 -0
  38. chatspatial/tools/visualization/cell_comm.py +699 -0
  39. chatspatial/tools/visualization/cnv.py +320 -0
  40. chatspatial/tools/visualization/core.py +684 -0
  41. chatspatial/tools/visualization/deconvolution.py +852 -0
  42. chatspatial/tools/visualization/enrichment.py +660 -0
  43. chatspatial/tools/visualization/integration.py +205 -0
  44. chatspatial/tools/visualization/main.py +164 -0
  45. chatspatial/tools/visualization/multi_gene.py +739 -0
  46. chatspatial/tools/visualization/persistence.py +335 -0
  47. chatspatial/tools/visualization/spatial_stats.py +469 -0
  48. chatspatial/tools/visualization/trajectory.py +639 -0
  49. chatspatial/tools/visualization/velocity.py +411 -0
  50. chatspatial/utils/__init__.py +115 -0
  51. chatspatial/utils/adata_utils.py +1372 -0
  52. chatspatial/utils/compute.py +327 -0
  53. chatspatial/utils/data_loader.py +499 -0
  54. chatspatial/utils/dependency_manager.py +462 -0
  55. chatspatial/utils/device_utils.py +165 -0
  56. chatspatial/utils/exceptions.py +185 -0
  57. chatspatial/utils/image_utils.py +267 -0
  58. chatspatial/utils/mcp_utils.py +137 -0
  59. chatspatial/utils/path_utils.py +243 -0
  60. chatspatial/utils/persistence.py +78 -0
  61. chatspatial/utils/scipy_compat.py +143 -0
  62. chatspatial-1.1.0.dist-info/METADATA +242 -0
  63. chatspatial-1.1.0.dist-info/RECORD +67 -0
  64. chatspatial-1.1.0.dist-info/WHEEL +5 -0
  65. chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
  66. chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
  67. chatspatial-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,499 @@
1
+ """
2
+ Data loading utilities for spatial transcriptomics data.
3
+
4
+ Handles loading various spatial data formats:
5
+ - H5AD files (AnnData format)
6
+ - H5 files (10x Genomics format)
7
+ - MTX directories (10x Visium structure)
8
+ - Visium directories with spatial information
9
+
10
+ For data persistence, see persistence.py.
11
+ """
12
+
13
+ import logging
14
+ import os
15
+ from typing import Any, Literal, Optional
16
+
17
+ from .adata_utils import ensure_unique_var_names, get_adata_profile
18
+ from .dependency_manager import is_available
19
+ from .exceptions import (
20
+ DataCompatibilityError,
21
+ DataNotFoundError,
22
+ ParameterError,
23
+ ProcessingError,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
async def load_spatial_data(
    data_path: str,
    data_type: Literal[
        "10x_visium", "slide_seq", "merfish", "seqfish", "other", "auto", "h5ad"
    ] = "auto",
    name: Optional[str] = None,
) -> dict[str, Any]:
    """Load a spatial transcriptomics dataset into an AnnData object.

    Args:
        data_path: Path to the data file or directory.
        data_type: Kind of spatial data. With ``"auto"`` the type is inferred
            from the file extension or directory layout. ``"h5ad"`` is treated
            as ``"other"`` (both are read with ``scanpy.read_h5ad``).
        name: Optional dataset name. Defaults to the last path component up to
            its first ``"."``.

    Returns:
        Dictionary with the keys ``name``, ``type``, ``path``, ``adata``,
        ``n_cells``, ``n_genes``, ``spatial_coordinates_available`` and
        ``tissue_image_available``, merged with whatever
        ``get_adata_profile(adata)`` returns.

    Raises:
        FileNotFoundError: ``data_path`` does not exist.
        DataNotFoundError: A ``FileNotFoundError`` occurred while reading
            10x Visium data.
        ProcessingError: Any other failure while reading the data. In the
            Visium branch this also wraps layout/format errors raised
            inside the loader.
        ParameterError: ``data_type`` is not one of the supported values.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data path not found: {data_path}")

    if data_type == "auto":
        data_type = _detect_data_type(data_path)

    # Convert h5ad to other for backward compatibility
    if data_type == "h5ad":
        data_type = "other"

    # Heavy dependencies are imported lazily, only once a load is attempted.
    import scanpy as sc
    import squidpy as sq

    if data_type == "10x_visium":
        try:
            if os.path.isdir(data_path):
                adata = _read_visium_directory(data_path)
            elif os.path.isfile(data_path) and data_path.endswith(".h5"):
                # Single 10x H5 file: expression only, spatial info is looked
                # up in nearby folders on a best-effort basis.
                adata = sc.read_10x_h5(data_path)
                spatial_path = _find_spatial_folder(data_path)
                if spatial_path:
                    try:
                        adata = _add_spatial_info_to_adata(adata, spatial_path)
                    except Exception as e:
                        logger.warning(f"Could not add spatial information: {e}")
            elif os.path.isfile(data_path) and data_path.endswith(".h5ad"):
                # An h5ad file explicitly marked as 10x_visium is read as h5ad.
                adata = sc.read_h5ad(data_path)
                if "spatial" not in adata.uns and not any(
                    "spatial" in key for key in adata.obsm.keys()
                ):
                    logger.warning(
                        "The h5ad file does not contain spatial information typically required for 10x Visium data"
                    )
            else:
                raise ParameterError(
                    f"Unsupported file format for 10x_visium: {data_path}. Supported formats: directory with Visium structure, .h5 file, or .h5ad file"
                )

            # Add spatial neighborhood graph if not already present
            if "spatial_connectivities" not in adata.obsp and "spatial" in adata.obsm:
                try:
                    sq.gr.spatial_neighbors(adata)
                except Exception as e:
                    logger.warning(f"Could not compute spatial neighbors: {e}")
        except FileNotFoundError as e:
            raise DataNotFoundError(f"File not found: {e}") from e
        except Exception as e:
            raise ProcessingError(_describe_visium_failure(data_path, e)) from e
    elif data_type in ("slide_seq", "merfish", "seqfish", "other"):
        # All remaining supported types are expected to be h5ad files.
        try:
            adata = sc.read_h5ad(data_path)
        except Exception as e:
            raise ProcessingError(f"Error loading {data_type} data: {e}") from e
    else:
        raise ParameterError(f"Unsupported data type: {data_type}")

    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" for directory paths such as "sample/".
    dataset_name = name or os.path.basename(os.path.normpath(data_path)).split(".")[0]

    n_cells = adata.n_obs
    n_genes = adata.n_vars

    # obsm["spatial"] is the actual coordinate storage location;
    # uns["spatial"] only contains metadata (scalefactors, images).
    spatial_coordinates_available = (
        hasattr(adata, "obsm")
        and "spatial" in adata.obsm
        and adata.obsm["spatial"] is not None
        and len(adata.obsm["spatial"]) > 0
    )

    # Structure: adata.uns["spatial"][library_id]["images"]["hires"/"lowres"].
    # Only an actual hires or lowres entry counts, not just a non-empty dict.
    tissue_image_available = False
    if "spatial" in adata.uns and isinstance(adata.uns["spatial"], dict):
        for sample_data in adata.uns["spatial"].values():
            if not (isinstance(sample_data, dict) and "images" in sample_data):
                continue
            images_dict = sample_data["images"]
            if isinstance(images_dict, dict) and (
                "hires" in images_dict or "lowres" in images_dict
            ):
                tissue_image_available = True
                break

    # Make variable names unique to avoid reindexing issues
    ensure_unique_var_names(adata)

    # Preserve the loaded state in .raw for downstream analysis, but never
    # overwrite a .raw that the file already carried.
    import anndata as ad

    if adata.raw is None:
        adata.raw = ad.AnnData(
            X=adata.X.copy(),
            var=adata.var,
            obs=adata.obs.copy(),
            uns={},
        )

    # Also ensure layers["counts"] exists for scVI-tools compatibility
    if "counts" not in adata.layers:
        adata.layers["counts"] = adata.X.copy()

    # Metadata profile for LLM understanding
    profile = get_adata_profile(adata)

    return {
        "name": dataset_name,
        "type": data_type,
        "path": data_path,
        "adata": adata,
        "n_cells": n_cells,
        "n_genes": n_genes,
        "spatial_coordinates_available": spatial_coordinates_available,
        "tissue_image_available": tissue_image_available,
        # Metadata profile from adata_utils
        **profile,
    }


def _detect_data_type(data_path: str) -> str:
    """Infer the data type of ``data_path`` from its extension or layout."""
    if os.path.isfile(data_path):
        if data_path.endswith(".h5ad"):
            return "h5ad"
        if data_path.endswith(".h5"):
            # Likely a 10x H5 file
            return "10x_visium"
        return "other"

    if os.path.isdir(data_path):
        visium_markers = (
            "filtered_feature_bc_matrix",
            "filtered_feature_bc_matrix.h5",
        )
        if any(
            os.path.exists(os.path.join(data_path, marker))
            for marker in visium_markers
        ):
            return "10x_visium"

    # Anything that cannot be identified is treated as "other".
    return "other"


def _read_visium_directory(data_path: str) -> Any:
    """Read a 10x Visium directory (H5-based or MTX-based layout)."""
    import scanpy as sc

    if os.path.exists(os.path.join(data_path, "filtered_feature_bc_matrix.h5")):
        # H5 file based 10x Visium directory structure
        return sc.read_visium(data_path)

    mtx_dir = os.path.join(data_path, "filtered_feature_bc_matrix")
    has_matrix = any(
        os.path.exists(os.path.join(mtx_dir, fname))
        for fname in ("matrix.mtx.gz", "matrix.mtx")
    )
    if not has_matrix:
        # Covers both a missing matrix folder and a folder without MTX files,
        # so the caller never continues with an unassigned AnnData.
        raise DataCompatibilityError(
            f"Directory {data_path} does not have the expected 10x Visium structure"
        )

    adata = sc.read_10x_mtx(mtx_dir, var_names="gene_symbols", cache=False)

    # Spatial information is optional here: failures are logged, not raised.
    spatial_dir = os.path.join(data_path, "spatial")
    if os.path.exists(spatial_dir):
        try:
            _attach_mtx_spatial_info(adata, data_path, spatial_dir)
        except Exception as e:
            logger.warning(f"Could not load spatial information: {e}")

    return adata


def _attach_mtx_spatial_info(adata: Any, data_path: str, spatial_dir: str) -> None:
    """Add coordinates and scalefactors from ``spatial_dir`` to ``adata`` in place."""
    import json

    import pandas as pd

    positions_path = os.path.join(spatial_dir, "tissue_positions_list.csv")
    if os.path.exists(positions_path):
        # The file may or may not start with a header row.
        with open(positions_path, "r") as f:
            first_line = f.readline().strip()

        if first_line.startswith("barcode"):
            positions = pd.read_csv(positions_path)
        else:
            positions = pd.read_csv(positions_path, header=None)
            positions.columns = [
                "barcode",
                "in_tissue",
                "array_row",
                "array_col",
                "pxl_row_in_fullres",
                "pxl_col_in_fullres",
            ]

        positions.set_index("barcode", inplace=True)

        # Filter for spots in tissue
        positions = positions[positions["in_tissue"] == 1]

        # Raises KeyError if a barcode of adata has no in-tissue position;
        # the caller turns that into a warning.
        adata.obsm["spatial"] = positions.loc[
            adata.obs_names,
            ["pxl_col_in_fullres", "pxl_row_in_fullres"],
        ].values

    scalefactors_path = os.path.join(spatial_dir, "scalefactors_json.json")
    if os.path.exists(scalefactors_path):
        with open(scalefactors_path, "r") as f:
            scalefactors = json.load(f)

        # Use the nested uns["spatial"][library_id] layout that the rest of
        # this module reads and writes, keyed by the dataset directory name.
        library_id = os.path.basename(os.path.normpath(data_path))
        if library_id in ("", ".", ".."):
            library_id = "sample_1"
        adata.uns["spatial"] = {
            library_id: {"scalefactors": scalefactors, "images": {}}
        }


def _describe_visium_failure(data_path: str, exc: Exception) -> str:
    """Build the Visium load error message, with hints based on the error text."""
    detail = str(exc)
    error_msg = f"Error loading 10x Visium data from {data_path}: {exc}"

    if "No matching barcodes" in detail:
        error_msg += "\n\nPossible solutions:"
        error_msg += "\n1. Check if the H5 file and spatial coordinates are from the same sample"
        error_msg += "\n2. Verify barcode format (with or without -1 suffix)"
        error_msg += "\n3. Ensure the spatial folder contains the correct tissue_positions_list.csv file"
    elif ".h5" in data_path and "read_10x_h5" in detail:
        error_msg += "\n\nThis might not be a valid 10x H5 file. Try:"
        error_msg += "\n1. Set data_type='h5ad' if this is an AnnData H5AD file"
        error_msg += "\n2. Verify the file is from 10x Genomics Cell Ranger output"
    elif "spatial" in detail.lower():
        error_msg += "\n\nSpatial data issue detected. Try:"
        error_msg += "\n1. Loading without spatial data by using data_type='other'"
        error_msg += "\n2. Ensuring spatial folder contains: tissue_positions_list.csv and scalefactors_json.json"

    return error_msg
320
+
321
+
322
def _find_spatial_folder(h5_path: str) -> Optional[str]:
    """
    Locate the folder holding spatial metadata for a 10x H5 file.

    Locations are probed in this order, relative to the H5 file's directory:
    1. ``spatial``
    2. ``../spatial``
    3. ``<file stem>_spatial``
    4. ``spatial_data``
    5. the file stem with ``_filtered_feature_bc_matrix`` replaced by ``_spatial``
    6. the file stem with ``_matrix`` replaced by ``_spatial``

    A location only qualifies if it is a directory containing both
    ``tissue_positions_list.csv`` and ``scalefactors_json.json``.

    Args:
        h5_path: Path to the H5 file

    Returns:
        Normalized path of the first qualifying folder, or None (after
        logging a warning) when no location qualifies
    """
    folder = os.path.dirname(h5_path)
    stem, _extension = os.path.splitext(os.path.basename(h5_path))
    must_contain = ("tissue_positions_list.csv", "scalefactors_json.json")

    # Each entry is the tuple of path components appended to the H5 directory.
    relative_locations = (
        ("spatial",),
        ("..", "spatial"),
        (f"{stem}_spatial",),
        ("spatial_data",),
        # Sample-specific spatial folders
        (stem.replace("_filtered_feature_bc_matrix", "_spatial"),),
        (stem.replace("_matrix", "_spatial"),),
    )

    for parts in relative_locations:
        location = os.path.normpath(os.path.join(folder, *parts))
        if not os.path.isdir(location):
            continue
        has_everything = all(
            os.path.exists(os.path.join(location, fname)) for fname in must_contain
        )
        if has_everything:
            return location

    logger.warning(f"No spatial folder found for {h5_path}")
    return None
364
+
365
+
366
def _add_spatial_info_to_adata(adata: Any, spatial_path: str) -> Any:
    """
    Add spatial information to an AnnData object.

    Reads ``tissue_positions_list.csv`` and ``scalefactors_json.json`` from
    ``spatial_path``, restricts ``adata`` to the barcodes that have a position,
    and stores coordinates in ``obsm["spatial"]``, the tissue flag in
    ``obs["in_tissue"]`` and scalefactors/images under
    ``uns["spatial"][library_id]``.

    Args:
        adata: AnnData object with expression data
        spatial_path: Path to spatial information folder

    Returns:
        A copy of ``adata`` subset to the matched barcodes, with spatial
        information added. The input object is not modified.

    Raises:
        DataCompatibilityError: The positions file has neither 5 nor 6
            columns, or no barcode matches the expression data.
        Exception: Any other error (e.g. a missing file) is logged and
            re-raised unchanged.
    """
    import json

    import numpy as np
    import pandas as pd

    try:
        positions_file = os.path.join(spatial_path, "tissue_positions_list.csv")

        # The file may or may not start with a header row.
        with open(positions_file, "r") as f:
            first_line = f.readline().strip()

        if first_line.startswith("barcode"):
            positions = pd.read_csv(positions_file)
        else:
            positions = pd.read_csv(positions_file, header=None)

        # Normalize column names for the two known layouts.
        n_columns = len(positions.columns)
        if n_columns == 6:
            positions.columns = [
                "barcode",
                "in_tissue",
                "array_row",
                "array_col",
                "pxl_row_in_fullres",
                "pxl_col_in_fullres",
            ]
        elif n_columns == 5:
            # Some datasets don't have the 'in_tissue' column
            positions.columns = [
                "barcode",
                "array_row",
                "array_col",
                "pxl_row_in_fullres",
                "pxl_col_in_fullres",
            ]
            positions["in_tissue"] = 1  # Assume all spots are in tissue
        else:
            raise DataCompatibilityError(
                f"Unexpected tissue positions format with {len(positions.columns)} columns"
            )

        positions.set_index("barcode", inplace=True)

        common_barcodes = adata.obs_names.intersection(positions.index)

        if len(common_barcodes) == 0:
            # Retry with a reconciled "-1" suffix. Only the first ten barcodes
            # of each side are sampled to decide which side carries it.
            expr_sample = adata.obs_names[:10]
            pos_sample = positions.index[:10]
            if all(bc.endswith("-1") for bc in expr_sample):
                # Expression data has -1 suffix, spatial doesn't
                positions.index = positions.index + "-1"
            elif all(not bc.endswith("-1") for bc in expr_sample) and all(
                bc.endswith("-1") for bc in pos_sample
            ):
                # Spatial has -1 suffix, expression doesn't. Anchor the
                # pattern so only the trailing suffix is removed.
                positions.index = positions.index.str.replace(
                    r"-1$", "", regex=True
                )

            common_barcodes = adata.obs_names.intersection(positions.index)

        if len(common_barcodes) == 0:
            raise DataCompatibilityError(
                "No matching barcodes between expression data and spatial coordinates"
            )

        # Work on a copy restricted to barcodes present on both sides.
        adata = adata[common_barcodes, :].copy()
        positions = positions.loc[common_barcodes]

        # Coordinates are stored as (column, row), i.e. (x, y) pixel order.
        adata.obsm["spatial"] = positions[
            ["pxl_col_in_fullres", "pxl_row_in_fullres"]
        ].values.astype(float)

        if "in_tissue" in positions.columns:
            adata.obs["in_tissue"] = positions["in_tissue"].values

        scalefactors_file = os.path.join(spatial_path, "scalefactors_json.json")
        with open(scalefactors_file, "r") as f:
            scalefactors = json.load(f)

        # library_id is the name of the folder containing the spatial folder,
        # so uns["spatial"] is never keyed by "spatial" itself. Relative
        # placeholders ("." / "..") are not meaningful names either.
        parent_dir = os.path.dirname(spatial_path.rstrip(os.sep))
        library_id = os.path.basename(parent_dir) if parent_dir else ""
        if library_id in ("", ".", ".."):
            library_id = "sample_1"  # Fallback to clear default name

        # Create spatial uns structure (scanpy expects nested structure)
        adata.uns["spatial"] = {
            library_id: {"scalefactors": scalefactors, "images": {}}
        }

        # Tissue images are optional and only loaded when Pillow is present.
        if is_available("Pillow"):
            from PIL import Image

            images = adata.uns["spatial"][library_id]["images"]
            for img_name in ["tissue_hires_image.png", "tissue_lowres_image.png"]:
                img_path = os.path.join(spatial_path, img_name)
                if not os.path.exists(img_path):
                    continue
                try:
                    # Image.open is lazy; materialize the pixels inside the
                    # context manager so the file handle is closed promptly.
                    with Image.open(img_path) as handle:
                        img = np.array(handle)

                    img_key = "hires" if "hires" in img_name else "lowres"
                    images[img_key] = img
                except Exception as e:
                    logger.warning(f"Could not load image {img_name}: {e}")
        else:
            logger.warning("Pillow not available, skipping tissue image loading")

        return adata

    except Exception as e:
        logger.error(f"Failed to add spatial information: {e}")
        raise