PyPI - chatspatial - Versions diffs - 1.1.0__py3-none-any.whl - Mend

chatspatial 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

chatspatial/__init__.py +11 -0
chatspatial/__main__.py +141 -0
chatspatial/cli/__init__.py +7 -0
chatspatial/config.py +53 -0
chatspatial/models/__init__.py +85 -0
chatspatial/models/analysis.py +513 -0
chatspatial/models/data.py +2462 -0
chatspatial/server.py +1763 -0
chatspatial/spatial_mcp_adapter.py +720 -0
chatspatial/tools/__init__.py +3 -0
chatspatial/tools/annotation.py +1903 -0
chatspatial/tools/cell_communication.py +1603 -0
chatspatial/tools/cnv_analysis.py +605 -0
chatspatial/tools/condition_comparison.py +595 -0
chatspatial/tools/deconvolution/__init__.py +402 -0
chatspatial/tools/deconvolution/base.py +318 -0
chatspatial/tools/deconvolution/card.py +244 -0
chatspatial/tools/deconvolution/cell2location.py +326 -0
chatspatial/tools/deconvolution/destvi.py +144 -0
chatspatial/tools/deconvolution/flashdeconv.py +101 -0
chatspatial/tools/deconvolution/rctd.py +317 -0
chatspatial/tools/deconvolution/spotlight.py +216 -0
chatspatial/tools/deconvolution/stereoscope.py +109 -0
chatspatial/tools/deconvolution/tangram.py +135 -0
chatspatial/tools/differential.py +625 -0
chatspatial/tools/embeddings.py +298 -0
chatspatial/tools/enrichment.py +1863 -0
chatspatial/tools/integration.py +807 -0
chatspatial/tools/preprocessing.py +723 -0
chatspatial/tools/spatial_domains.py +808 -0
chatspatial/tools/spatial_genes.py +836 -0
chatspatial/tools/spatial_registration.py +441 -0
chatspatial/tools/spatial_statistics.py +1476 -0
chatspatial/tools/trajectory.py +495 -0
chatspatial/tools/velocity.py +405 -0
chatspatial/tools/visualization/__init__.py +155 -0
chatspatial/tools/visualization/basic.py +393 -0
chatspatial/tools/visualization/cell_comm.py +699 -0
chatspatial/tools/visualization/cnv.py +320 -0
chatspatial/tools/visualization/core.py +684 -0
chatspatial/tools/visualization/deconvolution.py +852 -0
chatspatial/tools/visualization/enrichment.py +660 -0
chatspatial/tools/visualization/integration.py +205 -0
chatspatial/tools/visualization/main.py +164 -0
chatspatial/tools/visualization/multi_gene.py +739 -0
chatspatial/tools/visualization/persistence.py +335 -0
chatspatial/tools/visualization/spatial_stats.py +469 -0
chatspatial/tools/visualization/trajectory.py +639 -0
chatspatial/tools/visualization/velocity.py +411 -0
chatspatial/utils/__init__.py +115 -0
chatspatial/utils/adata_utils.py +1372 -0
chatspatial/utils/compute.py +327 -0
chatspatial/utils/data_loader.py +499 -0
chatspatial/utils/dependency_manager.py +462 -0
chatspatial/utils/device_utils.py +165 -0
chatspatial/utils/exceptions.py +185 -0
chatspatial/utils/image_utils.py +267 -0
chatspatial/utils/mcp_utils.py +137 -0
chatspatial/utils/path_utils.py +243 -0
chatspatial/utils/persistence.py +78 -0
chatspatial/utils/scipy_compat.py +143 -0
chatspatial-1.1.0.dist-info/METADATA +242 -0
chatspatial-1.1.0.dist-info/RECORD +67 -0
chatspatial-1.1.0.dist-info/WHEEL +5 -0
chatspatial-1.1.0.dist-info/entry_points.txt +2 -0
chatspatial-1.1.0.dist-info/licenses/LICENSE +21 -0
chatspatial-1.1.0.dist-info/top_level.txt +1 -0

chatspatial/utils/data_loader.py ADDED Viewed

@@ -0,0 +1,499 @@
+"""
+Data loading utilities for spatial transcriptomics data.
+Handles loading various spatial data formats:
+- H5AD files (AnnData format)
+- H5 files (10x Genomics format)
+- MTX directories (10x Visium structure)
+- Visium directories with spatial information
+For data persistence, see persistence.py.
+"""
+import logging
+import os
+from typing import Any, Literal, Optional
+from .adata_utils import ensure_unique_var_names, get_adata_profile
+from .dependency_manager import is_available
+from .exceptions import (
+    DataCompatibilityError,
+    DataNotFoundError,
+    ParameterError,
+    ProcessingError,
+)
+logger = logging.getLogger(__name__)
+async def load_spatial_data(
+    data_path: str,
+    data_type: Literal[
+        "10x_visium", "slide_seq", "merfish", "seqfish", "other", "auto", "h5ad"
+    ] = "auto",
+    name: Optional[str] = None,
+) -> dict[str, Any]:
+    """Load spatial transcriptomics data
+    Args:
+        data_path: Path to the data file or directory
+        data_type: Type of spatial data. If 'auto', will try to determine the type from the file extension or directory structure.
+        name: Optional name for the dataset
+    Returns:
+        Dictionary with dataset information and AnnData object
+    """
+    # Validate path
+    if not os.path.exists(data_path):
+        raise FileNotFoundError(f"Data path not found: {data_path}")
+    # Auto-detect data type if set to 'auto'
+    if data_type == "auto":
+        if os.path.isfile(data_path):
+            if data_path.endswith(".h5ad"):
+                # It's an h5ad file
+                data_type = "h5ad"
+            elif data_path.endswith(".h5"):
+                # It's likely a 10x H5 file
+                data_type = "10x_visium"
+            else:
+                # Default to other for unknown file types
+                data_type = "other"
+        elif os.path.isdir(data_path):
+            # Check if it has the structure of a 10x Visium dataset
+            if os.path.exists(
+                os.path.join(data_path, "filtered_feature_bc_matrix")
+            ) or os.path.exists(
+                os.path.join(data_path, "filtered_feature_bc_matrix.h5")
+            ):
+                data_type = "10x_visium"
+            else:
+                # Default to other if we can't determine
+                data_type = "other"
+        else:
+            # Default to other for unknown file types
+            data_type = "other"
+    # Convert h5ad to other for backward compatibility
+    if data_type == "h5ad":
+        data_type = "other"
+    # Import dependencies
+    import scanpy as sc
+    import squidpy as sq
+    # Load data based on data_type
+    if data_type == "10x_visium":
+        # For 10x Visium, we need to provide the path to the directory containing the data
+        try:
+            # Check if it's a directory or an h5ad file
+            if os.path.isdir(data_path):
+                # Check if the directory has the expected structure
+                if os.path.exists(
+                    os.path.join(data_path, "filtered_feature_bc_matrix.h5")
+                ):
+                    # H5 file based 10x Visium directory structure
+                    adata = sc.read_visium(data_path)
+                elif os.path.exists(
+                    os.path.join(data_path, "filtered_feature_bc_matrix")
+                ):
+                    # Check if it contains MTX files (compressed or uncompressed)
+                    mtx_dir = os.path.join(data_path, "filtered_feature_bc_matrix")
+                    if os.path.exists(
+                        os.path.join(mtx_dir, "matrix.mtx.gz")
+                    ) or os.path.exists(os.path.join(mtx_dir, "matrix.mtx")):
+                        # Matrix files based 10x Visium directory structure
+                        # Use scanpy's read_10x_mtx function
+                        adata = sc.read_10x_mtx(
+                            os.path.join(data_path, "filtered_feature_bc_matrix"),
+                            var_names="gene_symbols",
+                            cache=False,
+                        )
+                        # Try to load spatial coordinates if available
+                        spatial_dir = os.path.join(data_path, "spatial")
+                        if os.path.exists(spatial_dir):
+                            try:
+                                # Add spatial information manually
+                                import json
+                                import pandas as pd
+                                # Load tissue positions
+                                positions_path = os.path.join(
+                                    spatial_dir, "tissue_positions_list.csv"
+                                )
+                                if os.path.exists(positions_path):
+                                    # Try to detect if file has header
+                                    with open(positions_path, "r") as f:
+                                        first_line = f.readline().strip()
+                                    if first_line.startswith("barcode"):
+                                        # File has header
+                                        positions = pd.read_csv(positions_path)
+                                    else:
+                                        # File has no header
+                                        positions = pd.read_csv(
+                                            positions_path, header=None
+                                        )
+                                        positions.columns = [
+                                            "barcode",
+                                            "in_tissue",
+                                            "array_row",
+                                            "array_col",
+                                            "pxl_row_in_fullres",
+                                            "pxl_col_in_fullres",
+                                        ]
+                                    positions.set_index("barcode", inplace=True)
+                                    # Filter for spots in tissue
+                                    positions = positions[positions["in_tissue"] == 1]
+                                    # Add spatial coordinates to adata
+                                    adata.obsm["spatial"] = positions.loc[
+                                        adata.obs_names,
+                                        ["pxl_col_in_fullres", "pxl_row_in_fullres"],
+                                    ].values
+                                    # Load scalefactors
+                                    scalefactors_path = os.path.join(
+                                        spatial_dir, "scalefactors_json.json"
+                                    )
+                                    if os.path.exists(scalefactors_path):
+                                        with open(scalefactors_path, "r") as f:
+                                            scalefactors = json.load(f)
+                                        # Add scalefactors to adata
+                                        adata.uns["spatial"] = {
+                                            "scalefactors": scalefactors
+                                        }
+                            except Exception as e:
+                                logger.warning(
+                                    f"Could not load spatial information: {e}"
+                                )
+                else:
+                    raise DataCompatibilityError(
+                        f"Directory {data_path} does not have the expected 10x Visium structure"
+                    )
+            elif os.path.isfile(data_path) and data_path.endswith(".h5"):
+                # Single H5 file - new support for 10x H5 format
+                adata = sc.read_10x_h5(data_path)
+                # Try to find and add spatial information
+                spatial_path = _find_spatial_folder(data_path)
+                if spatial_path:
+                    try:
+                        adata = _add_spatial_info_to_adata(adata, spatial_path)
+                    except Exception as e:
+                        logger.warning(f"Could not add spatial information: {e}")
+            elif os.path.isfile(data_path) and data_path.endswith(".h5ad"):
+                # If it's an h5ad file but marked as 10x_visium, read it as h5ad
+                adata = sc.read_h5ad(data_path)
+                # Check if it has the necessary spatial information
+                if "spatial" not in adata.uns and not any(
+                    "spatial" in key for key in adata.obsm.keys()
+                ):
+                    logger.warning(
+                        "The h5ad file does not contain spatial information typically required for 10x Visium data"
+                    )
+            else:
+                raise ParameterError(
+                    f"Unsupported file format for 10x_visium: {data_path}. Supported formats: directory with Visium structure, .h5 file, or .h5ad file"
+                )
+            # Add spatial neighborhood graph if not already present
+            if "spatial_connectivities" not in adata.obsp and "spatial" in adata.obsm:
+                try:
+                    sq.gr.spatial_neighbors(adata)
+                except Exception as e:
+                    logger.warning(f"Could not compute spatial neighbors: {e}")
+        except FileNotFoundError as e:
+            raise DataNotFoundError(f"File not found: {e}") from e
+        except Exception as e:
+            # Provide more detailed error information
+            error_msg = f"Error loading 10x Visium data from {data_path}: {e}"
+            # Add helpful suggestions based on error type
+            if "No matching barcodes" in str(e):
+                error_msg += "\n\nPossible solutions:"
+                error_msg += "\n1. Check if the H5 file and spatial coordinates are from the same sample"
+                error_msg += "\n2. Verify barcode format (with or without -1 suffix)"
+                error_msg += "\n3. Ensure the spatial folder contains the correct tissue_positions_list.csv file"
+            elif ".h5" in data_path and "read_10x_h5" in str(e):
+                error_msg += "\n\nThis might not be a valid 10x H5 file. Try:"
+                error_msg += "\n1. Set data_type='h5ad' if this is an AnnData H5AD file"
+                error_msg += (
+                    "\n2. Verify the file is from 10x Genomics Cell Ranger output"
+                )
+            elif "spatial" in str(e).lower():
+                error_msg += "\n\nSpatial data issue detected. Try:"
+                error_msg += (
+                    "\n1. Loading without spatial data by using data_type='other'"
+                )
+                error_msg += "\n2. Ensuring spatial folder contains: tissue_positions_list.csv and scalefactors_json.json"
+            raise ProcessingError(error_msg) from e
+    elif data_type == "h5ad" or data_type in [
+        "slide_seq",
+        "merfish",
+        "seqfish",
+        "other",
+    ]:
+        # For h5ad files or other data types
+        try:
+            adata = sc.read_h5ad(data_path)
+        except Exception as e:
+            raise ProcessingError(f"Error loading {data_type} data: {e}") from e
+    else:
+        raise ParameterError(f"Unsupported data type: {data_type}")
+    # Set dataset name
+    dataset_name = name or os.path.basename(data_path).split(".")[0]
+    # Calculate basic statistics
+    n_cells = adata.n_obs
+    n_genes = adata.n_vars
+    # Check if spatial coordinates are available
+    # Priority: obsm["spatial"] is the actual coordinate storage location
+    # uns["spatial"] only contains metadata (scalefactors, images) not coordinates
+    spatial_coordinates_available = (
+        hasattr(adata, "obsm")
+        and "spatial" in adata.obsm
+        and adata.obsm["spatial"] is not None
+        and len(adata.obsm["spatial"]) > 0
+    )
+    # Check if tissue image is available (for Visium data)
+    # Structure: adata.uns["spatial"][library_id]["images"]["hires"/"lowres"]
+    # Must check for actual hires or lowres images, not just non-empty dict
+    tissue_image_available = False
+    if "spatial" in adata.uns and isinstance(adata.uns["spatial"], dict):
+        for _sample_key, sample_data in adata.uns["spatial"].items():
+            # Each sample_data should be a dict with "images" key
+            if isinstance(sample_data, dict) and "images" in sample_data:
+                images_dict = sample_data["images"]
+                # Check if images dict has actual hires or lowres images
+                if isinstance(images_dict, dict) and (
+                    "hires" in images_dict or "lowres" in images_dict
+                ):
+                    tissue_image_available = True
+                    break
+    # Make variable names unique to avoid reindexing issues
+    ensure_unique_var_names(adata)
+    # Preserve raw data for downstream analysis (C2 strategy)
+    # Only save if .raw doesn't already exist - respect user's existing .raw
+    import anndata as ad
+    if adata.raw is None:
+        # Save current data state to .raw
+        # This ensures downstream tools always have access to original loaded data
+        adata.raw = ad.AnnData(
+            X=adata.X.copy(),
+            var=adata.var,
+            obs=adata.obs.copy(),
+            uns={},
+        )
+    # Also ensure layers["counts"] exists for scVI-tools compatibility
+    if "counts" not in adata.layers:
+        adata.layers["counts"] = adata.X.copy()
+    # Get metadata profile for LLM understanding
+    profile = get_adata_profile(adata)
+    # Return dataset info and AnnData object with comprehensive metadata
+    return {
+        "name": dataset_name,
+        "type": data_type,
+        "path": data_path,
+        "adata": adata,
+        "n_cells": n_cells,
+        "n_genes": n_genes,
+        "spatial_coordinates_available": spatial_coordinates_available,
+        "tissue_image_available": tissue_image_available,
+        # Metadata profile from adata_utils
+        **profile,
+    }
+def _find_spatial_folder(h5_path: str) -> Optional[str]:
+    """
+    Intelligently find spatial information folder for a given H5 file.
+    Search strategy:
+    1. Same directory 'spatial' folder
+    2. Parent directory 'spatial' folder
+    3. Same name prefix spatial folder
+    4. Common variations
+    Args:
+        h5_path: Path to the H5 file
+    Returns:
+        Path to spatial folder if found, None otherwise
+    """
+    base_dir = os.path.dirname(h5_path)
+    base_name = os.path.splitext(os.path.basename(h5_path))[0]
+    # Candidate paths to check
+    candidates = [
+        os.path.join(base_dir, "spatial"),
+        os.path.join(base_dir, "..", "spatial"),
+        os.path.join(base_dir, f"{base_name}_spatial"),
+        os.path.join(base_dir, "spatial_data"),
+        # Check for sample-specific spatial folders
+        os.path.join(
+            base_dir, base_name.replace("_filtered_feature_bc_matrix", "_spatial")
+        ),
+        os.path.join(base_dir, base_name.replace("_matrix", "_spatial")),
+    ]
+    for candidate in candidates:
+        candidate = os.path.normpath(candidate)
+        if os.path.exists(candidate) and os.path.isdir(candidate):
+            # Verify it contains required spatial files
+            required_files = ["tissue_positions_list.csv", "scalefactors_json.json"]
+            if all(os.path.exists(os.path.join(candidate, f)) for f in required_files):
+                return candidate
+    logger.warning(f"No spatial folder found for {h5_path}")
+    return None
+def _add_spatial_info_to_adata(adata: Any, spatial_path: str) -> Any:
+    """
+    Add spatial information to an AnnData object.
+    Args:
+        adata: AnnData object with expression data
+        spatial_path: Path to spatial information folder
+    Returns:
+        AnnData object with spatial information added
+    """
+    import json
+    import numpy as np
+    import pandas as pd
+    try:
+        # Load tissue positions
+        positions_file = os.path.join(spatial_path, "tissue_positions_list.csv")
+        # Try to detect if file has header
+        with open(positions_file, "r") as f:
+            first_line = f.readline().strip()
+        if first_line.startswith("barcode"):
+            # File has header
+            positions = pd.read_csv(positions_file)
+        else:
+            # File has no header
+            positions = pd.read_csv(positions_file, header=None)
+            # Handle different formats of tissue positions file
+            if len(positions.columns) == 6:
+                positions.columns = [
+                    "barcode",
+                    "in_tissue",
+                    "array_row",
+                    "array_col",
+                    "pxl_row_in_fullres",
+                    "pxl_col_in_fullres",
+                ]
+            elif len(positions.columns) == 5:
+                # Some datasets don't have the 'in_tissue' column
+                positions.columns = [
+                    "barcode",
+                    "array_row",
+                    "array_col",
+                    "pxl_row_in_fullres",
+                    "pxl_col_in_fullres",
+                ]
+                positions["in_tissue"] = 1  # Assume all spots are in tissue
+            else:
+                raise DataCompatibilityError(
+                    f"Unexpected tissue positions format with {len(positions.columns)} columns"
+                )
+        positions.set_index("barcode", inplace=True)
+        # Find common barcodes between expression data and spatial coordinates
+        common_barcodes = adata.obs_names.intersection(positions.index)
+        if len(common_barcodes) == 0:
+            # Try with modified barcode format (sometimes -1 suffix is added/removed)
+            if all("-1" in bc for bc in adata.obs_names[:10]):
+                # Expression data has -1 suffix, spatial doesn't
+                positions.index = positions.index + "-1"
+            elif all("-1" not in bc for bc in adata.obs_names[:10]) and all(
+                "-1" in bc for bc in positions.index[:10]
+            ):
+                # Spatial has -1 suffix, expression doesn't
+                positions.index = positions.index.str.replace("-1", "")
+            # Try again
+            common_barcodes = adata.obs_names.intersection(positions.index)
+        if len(common_barcodes) == 0:
+            raise DataCompatibilityError(
+                "No matching barcodes between expression data and spatial coordinates"
+            )
+        # Filter to common barcodes
+        adata = adata[common_barcodes, :].copy()
+        positions = positions.loc[common_barcodes]
+        # Add spatial coordinates
+        adata.obsm["spatial"] = positions[
+            ["pxl_col_in_fullres", "pxl_row_in_fullres"]
+        ].values.astype(float)
+        # Add tissue information
+        if "in_tissue" in positions.columns:
+            adata.obs["in_tissue"] = positions["in_tissue"].values
+        # Load scalefactors
+        scalefactors_file = os.path.join(spatial_path, "scalefactors_json.json")
+        with open(scalefactors_file, "r") as f:
+            scalefactors = json.load(f)
+        # Generate meaningful library_id from spatial_path
+        # Priority: parent directory name (usually sample name) > "sample_1" default
+        # Avoid using "spatial" as library_id to prevent confusing adata.uns["spatial"]["spatial"] nesting
+        parent_dir = os.path.dirname(spatial_path.rstrip(os.sep))
+        if parent_dir and os.path.basename(parent_dir) != "":
+            library_id = os.path.basename(parent_dir)
+        else:
+            library_id = "sample_1"  # Fallback to clear default name
+        # Create spatial uns structure (scanpy expects nested structure)
+        adata.uns["spatial"] = {
+            library_id: {"scalefactors": scalefactors, "images": {}}
+        }
+        # Try to load images if available (using centralized dependency manager)
+        if is_available("Pillow"):
+            from PIL import Image
+            for img_name in ["tissue_hires_image.png", "tissue_lowres_image.png"]:
+                img_path = os.path.join(spatial_path, img_name)
+                if os.path.exists(img_path):
+                    try:
+                        img = np.array(Image.open(img_path))
+                        img_key = "hires" if "hires" in img_name else "lowres"
+                        adata.uns["spatial"][library_id]["images"][img_key] = img
+                    except Exception as e:
+                        logger.warning(f"Could not load image {img_name}: {e}")
+        else:
+            logger.warning("Pillow not available, skipping tissue image loading")
+        return adata
+    except Exception as e:
+        logger.error(f"Failed to add spatial information: {e}")
+        raise