PyPI - gsMap3D - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

gsMap3D 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

gsMap/__init__.py +13 -0
gsMap/__main__.py +4 -0
gsMap/cauchy_combination_test.py +342 -0
gsMap/cli.py +355 -0
gsMap/config/__init__.py +72 -0
gsMap/config/base.py +296 -0
gsMap/config/cauchy_config.py +79 -0
gsMap/config/dataclasses.py +235 -0
gsMap/config/decorators.py +302 -0
gsMap/config/find_latent_config.py +276 -0
gsMap/config/format_sumstats_config.py +54 -0
gsMap/config/latent2gene_config.py +461 -0
gsMap/config/ldscore_config.py +261 -0
gsMap/config/quick_mode_config.py +242 -0
gsMap/config/report_config.py +81 -0
gsMap/config/spatial_ldsc_config.py +334 -0
gsMap/config/utils.py +286 -0
gsMap/find_latent/__init__.py +3 -0
gsMap/find_latent/find_latent_representation.py +312 -0
gsMap/find_latent/gnn/distribution.py +498 -0
gsMap/find_latent/gnn/encoder_decoder.py +186 -0
gsMap/find_latent/gnn/gcn.py +85 -0
gsMap/find_latent/gnn/gene_former.py +164 -0
gsMap/find_latent/gnn/loss.py +18 -0
gsMap/find_latent/gnn/st_model.py +125 -0
gsMap/find_latent/gnn/train_step.py +177 -0
gsMap/find_latent/st_process.py +781 -0
gsMap/format_sumstats.py +446 -0
gsMap/generate_ldscore.py +1018 -0
gsMap/latent2gene/__init__.py +18 -0
gsMap/latent2gene/connectivity.py +781 -0
gsMap/latent2gene/entry_point.py +141 -0
gsMap/latent2gene/marker_scores.py +1265 -0
gsMap/latent2gene/memmap_io.py +766 -0
gsMap/latent2gene/rank_calculator.py +590 -0
gsMap/latent2gene/row_ordering.py +182 -0
gsMap/latent2gene/row_ordering_jax.py +159 -0
gsMap/ldscore/__init__.py +1 -0
gsMap/ldscore/batch_construction.py +163 -0
gsMap/ldscore/compute.py +126 -0
gsMap/ldscore/constants.py +70 -0
gsMap/ldscore/io.py +262 -0
gsMap/ldscore/mapping.py +262 -0
gsMap/ldscore/pipeline.py +615 -0
gsMap/pipeline/quick_mode.py +134 -0
gsMap/report/__init__.py +2 -0
gsMap/report/diagnosis.py +375 -0
gsMap/report/report.py +100 -0
gsMap/report/report_data.py +1832 -0
gsMap/report/static/js_lib/alpine.min.js +5 -0
gsMap/report/static/js_lib/tailwindcss.js +83 -0
gsMap/report/static/template.html +2242 -0
gsMap/report/three_d_combine.py +312 -0
gsMap/report/three_d_plot/three_d_plot_decorate.py +246 -0
gsMap/report/three_d_plot/three_d_plot_prepare.py +202 -0
gsMap/report/three_d_plot/three_d_plots.py +425 -0
gsMap/report/visualize.py +1409 -0
gsMap/setup.py +5 -0
gsMap/spatial_ldsc/__init__.py +0 -0
gsMap/spatial_ldsc/io.py +656 -0
gsMap/spatial_ldsc/ldscore_quick_mode.py +912 -0
gsMap/spatial_ldsc/spatial_ldsc_jax.py +382 -0
gsMap/spatial_ldsc/spatial_ldsc_multiple_sumstats.py +439 -0
gsMap/utils/__init__.py +0 -0
gsMap/utils/generate_r2_matrix.py +610 -0
gsMap/utils/jackknife.py +518 -0
gsMap/utils/manhattan_plot.py +643 -0
gsMap/utils/regression_read.py +177 -0
gsMap/utils/torch_utils.py +23 -0
gsmap3d-0.1.0a1.dist-info/METADATA +168 -0
gsmap3d-0.1.0a1.dist-info/RECORD +74 -0
gsmap3d-0.1.0a1.dist-info/WHEEL +4 -0
gsmap3d-0.1.0a1.dist-info/entry_points.txt +2 -0
gsmap3d-0.1.0a1.dist-info/licenses/LICENSE +21 -0

gsMap/config/ldscore_config.py ADDED Viewed

@@ -0,0 +1,261 @@
+"""
+Configuration dataclasses for the general LD score framework.
+"""
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Annotated
+import typer
+from gsMap.config.base import BaseConfig, ConfigWithAutoPaths
+logger = logging.getLogger(__name__)
+@dataclass
+class LDScoreConfig(BaseConfig):
+    """LD Score Weights Configuration"""
+    # Paths
+    bfile_root: Annotated[str, typer.Option(
+        help="Reference panel prefix template (e.g., 'data/1000G.{chr}')"
+    )]
+    hm3_snp_path: Annotated[Path, typer.Option(
+        help="Path to HM3 SNP list",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True
+    )]
+    output_dir: Annotated[Path | None, typer.Option(
+        help="Output directory. If None, uses {workdir}/{project_name}/generate_ldscore"
+    )] = None
+    output_filename: Annotated[str, typer.Option(
+        help="Prefix for output files"
+    )] = "ld_score_weights"
+    # Omics Input
+    omics_h5ad_path: Annotated[Path | None, typer.Option(
+        help="Path to omics H5AD file",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True
+    )] = None
+    # Mapping Input (Strategy A/B)
+    mapping_type: Annotated[str, typer.Option(
+         help="Mapping type: 'bed' or 'dict'"
+    )] = "bed"
+    mapping_file: Annotated[Path | None, typer.Option(
+        help="Path to mapping file",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True
+    )] = None
+    # Annotation Input (Strategy C - Direct Annotation Matrix)
+    annot_file: Annotated[str | None, typer.Option(
+        help="Template for annotation files (e.g., 'baseline.{chr}.annot.gz')"
+    )] = None
+    # Mapping Strategy parameters
+    feature_window_size: Annotated[int, typer.Option(
+        help="bp window for mapping (e.g. TSS window)"
+    )] = 0
+    strategy: Annotated[str, typer.Option(
+        help="Strategy: 'score', 'tss', 'center', 'allow_repeat'"
+    )] = "score"
+    # LD Calculation parameters
+    ld_wind: Annotated[float, typer.Option(
+        help="LD window size"
+    )] = 1.0
+    ld_unit: Annotated[str, typer.Option(
+        help="LD unit: 'SNP', 'KB', 'CM'"
+    )] = "CM"
+    maf_min: Annotated[float, typer.Option(
+        help="Minimum MAF filter"
+    )] = 0.01
+    # Computation
+    chromosomes: Annotated[str, typer.Option(
+        help="Chromosomes to process. 'all' uses 1-22 autosomes, or provide a comma-separated list of chromosomes (e.g., '1,2,3')"
+    )] = "all"
+    batch_size_hm3: Annotated[int, typer.Option(
+        help="Batch size for HM3 SNPs"
+    )] = 50
+    # w_ld Calculation
+    calculate_w_ld: Annotated[bool, typer.Option(
+        help="Whether to calculate w_ld"
+    )] = False
+    w_ld_dir: Annotated[Path | None, typer.Option(
+        help="Directory for w_ld outputs"
+    )] = None
+    def __post_init__(self):
+        """
+        Post-initialization processing:
+        1. Parse chromosome string to list.
+        2. Fix PLINK file template.
+        3. Validate file existence.
+        """
+        # Parse output_dir if provided
+        if self.output_dir is not None:
+            self.output_dir = Path(self.output_dir)
+        else:
+            raise ValueError("output_dir must be provided for LDScoreConfig.")
+        # 3. Parse Chromosomes
+        if self.chromosomes == "all":
+            self.chromosomes = list(range(1, 23))
+        elif isinstance(self.chromosomes, str):
+            # Handle string input like "1,2,22"
+            self.chromosomes = [int(x) for x in self.chromosomes.split(',')]
+        # Else it's already a list or properly set
+        # 4. Handle PLINK Prefix Template
+        # Ensure bfile_root has {chr} placeholder
+        if "{chr}" not in self.bfile_root:
+            logger.warning(
+                f"The 'bfile_root' ({self.bfile_root}) does not contain the '{{chr}}' placeholder. "
+                f"Appending '.{{chr}}' to the prefix."
+            )
+            self.bfile_root = f"{self.bfile_root}.{{chr}}"
+        # 5. Validate PLINK Files
+        logger.info("Validating PLINK binary files based on template...")
+        missing_paths = []
+        for chrom in self.chromosomes:
+            prefix = self.bfile_root.format(chr=chrom)
+            bed_path = Path(f"{prefix}.bed")
+            if not bed_path.exists():
+                missing_paths.append(str(bed_path))
+        if missing_paths:
+            error_msg = (
+                f"PLINK .bed files missing for {len(missing_paths)} chromosomes. "
+                "The following files were not found:\n" + "\n".join(f"  - {p}" for p in missing_paths)
+            )
+            raise FileNotFoundError(error_msg)
+        logger.info(f"Confirmed all PLINK files exist for {len(self.chromosomes)} chromosomes.")
+        self.show_config(LDScoreConfig)
+@dataclass
+class GenerateLDScoreConfig(ConfigWithAutoPaths):
+    """Generate LDScore Configuration"""
+    # Fields wrapped in Annotated[..., typer.Option(...)] carry CLI help text and
+    # constraints; the remaining fields are plain dataclass attributes.
+    # Required from parent
+    workdir: Annotated[Path, typer.Option(
+        help="Path to the working directory",
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        resolve_path=True
+    )]
+    chrom: Annotated[str, typer.Option(
+        help='Chromosome id (1-22) or "all"'
+    )]
+    bfile_root: Annotated[str, typer.Option(
+        help="Root path for genotype plink bfiles (.bim, .bed, .fam)"
+    )]
+    gtf_annotation_file: Annotated[Path, typer.Option(
+        help="Path to GTF annotation file",
+        exists=True,
+        file_okay=True,
+        dir_okay=False
+    )]
+    sample_name: Annotated[str | None, typer.Option(
+        help="Name of the sample"
+    )] = None
+    keep_snp_root: str | None = None  # Internal field
+    # Bounded to 1,000..1,000,000 bp by the option constraints below.
+    gene_window_size: Annotated[int, typer.Option(
+        help="Gene window size in base pairs",
+        min=1000,
+        max=1000000
+    )] = 50000
+    enhancer_annotation_file: Annotated[Path | None, typer.Option(
+        help="Path to enhancer annotation file",
+        exists=True,
+        file_okay=True,
+        dir_okay=False
+    )] = None
+    snp_multiple_enhancer_strategy: Annotated[str, typer.Option(
+        help="Strategy for handling multiple enhancers per SNP",
+        case_sensitive=False
+    )] = "max_mkscore"
+    gene_window_enhancer_priority: Annotated[str | None, typer.Option(
+        help="Priority between gene window and enhancer annotations"
+    )] = None
+    additional_baseline_annotation: Annotated[str | None, typer.Option(
+        help="Path of additional baseline annotations"
+    )] = None
+    spots_per_chunk: Annotated[int, typer.Option(
+        help="Number of spots per chunk",
+        min=100,
+        max=10000
+    )] = 1000
+    # NOTE(review): an int limited to 1..10 here, while LDScoreConfig.ld_wind in
+    # this module is an unconstrained float — confirm the difference is intended.
+    ld_wind: Annotated[int, typer.Option(
+        help="LD window size",
+        min=1,
+        max=10
+    )] = 1
+    ld_unit: Annotated[str, typer.Option(
+        help="Unit for LD window",
+        case_sensitive=False
+    )] = "CM"
+    # Additional fields
+    ldscore_save_format: str = "feather"
+    save_pre_calculate_snp_gene_weight_matrix: bool = False
+    baseline_annotation_dir: str | None = None
+    SNP_gene_pair_dir: str | None = None
+    # Runs the parent class's post-init first, then calls show_config for this class.
+    def __post_init__(self):
+        super().__post_init__()
+        self.show_config(GenerateLDScoreConfig)
+def check_ldscore_done(config: GenerateLDScoreConfig) -> bool:
+    """
+    Check if generate_ldscore step is done.
+    """
+    # Assuming it's done if w_ld directory exists and has files
+    w_ld_dir = Path(config.ldscore_save_dir) / "w_ld"
+    if not w_ld_dir.exists():
+        return False
+    # Check if there are any .l2.ldscore.gz files
+    return any(w_ld_dir.glob("*.l2.ldscore.gz"))

gsMap/config/quick_mode_config.py ADDED Viewed

@@ -0,0 +1,242 @@
+"""
+Configuration for Quick Mode pipeline.
+"""
+import logging
+from dataclasses import dataclass, fields
+from pathlib import Path
+from typing import Annotated
+import typer
+from gsMap.config.base import ConfigWithAutoPaths
+from .cauchy_config import CauchyCombinationConfig
+# Use relative imports to avoid circular dependency
+from .find_latent_config import FindLatentRepresentationsConfig
+from .latent2gene_config import DatasetType, LatentToGeneConfig
+from .report_config import ReportConfig
+from .spatial_ldsc_config import SpatialLDSCConfig
+logger = logging.getLogger("gsMap.config")
+@dataclass
+class QuickModeConfig(ReportConfig, SpatialLDSCConfig, LatentToGeneConfig, FindLatentRepresentationsConfig, ConfigWithAutoPaths):
+    """Quick Mode Pipeline Configuration"""
+    __core_only__ = True
+    # ------------------------------------------------------------------------
+    # Pipeline Control
+    # ------------------------------------------------------------------------
+    start_step: Annotated[str, typer.Option(
+        help="Step to start execution from (find_latent, latent2gene, spatial_ldsc, cauchy, report)",
+        case_sensitive=False
+    )] = "find_latent"
+    stop_step: Annotated[str | None, typer.Option(
+        help="Step to stop execution at (inclusive)",
+        case_sensitive=False
+    )] = None
+    def __post_init__(self):
+        ConfigWithAutoPaths.__post_init__(self)
+        self._init_sumstats()
+        self._init_annotation_list()
+        if self.is_both_latent_and_gene_running:
+            self.high_quality_neighbor_filter = self.high_quality_cell_qc
+            # Use dual embeddings if both steps are running
+            if self.latent_representation_niche is None:
+                self.latent_representation_niche = "emb_niche"
+            if self.latent_representation_cell is None:
+                self.latent_representation_cell = "emb_cell"
+        self.show_config(QuickModeConfig)
+    @property
+    def is_both_latent_and_gene_running(self) -> bool:
+        """Check if both find_latent and latent2gene are in the execution range."""
+        steps = ["find_latent", "latent2gene", "spatial_ldsc", "cauchy", "report"]
+        try:
+            start_idx = steps.index(self.start_step)
+            stop_idx = steps.index(self.stop_step) if self.stop_step else len(steps) - 1
+            return start_idx <= 0 and stop_idx >= 1
+        except ValueError:
+            return False
+    @property
+    def find_latent_config(self) -> FindLatentRepresentationsConfig:
+        return FindLatentRepresentationsConfig(**{
+            f.name: getattr(self, f.name) for f in fields(FindLatentRepresentationsConfig) if f.init
+        })
+    @property
+    def latent2gene_config(self) -> LatentToGeneConfig:
+        params = {f.name: getattr(self, f.name) for f in fields(LatentToGeneConfig) if f.init}
+        return LatentToGeneConfig(**params)
+    @property
+    def spatial_ldsc_config(self) -> SpatialLDSCConfig:
+        return SpatialLDSCConfig(**{
+            f.name: getattr(self, f.name) for f in fields(SpatialLDSCConfig) if f.init
+        })
+    @property
+    def report_config(self) -> ReportConfig:
+        return ReportConfig(**{
+            f.name: getattr(self, f.name) for f in fields(ReportConfig) if f.init and hasattr(self, f.name)
+        })
+    @property
+    def cauchy_config(self) -> CauchyCombinationConfig:
+        return CauchyCombinationConfig(**{
+            f.name: getattr(self, f.name) for f in fields(CauchyCombinationConfig) if f.init and hasattr(self, f.name)
+        })
+def check_report_done(config: QuickModeConfig, verbose: bool = False) -> bool:
+    missing_data_files, missing_web_files = get_report_missing_files(config)
+    missing_files = missing_data_files + missing_web_files
+    if missing_files and verbose:
+        logger.info(f"Report incomplete. Missing {len(missing_files)} files:")
+        for f in missing_files[:10]:  # Show first 10
+            logger.info(f"  - {f}")
+        if len(missing_files) > 10:
+            logger.info(f"  ... and {len(missing_files) - 10} more")
+    return len(missing_files) == 0
+def get_report_missing_files(config: QuickModeConfig) -> tuple[list[Path], list[Path]]:
+    """
+    Get lists of missing report files, categorized by type.
+    Returns:
+        Tuple of (missing_data_files, missing_web_files)
+    """
+    missing_data_files = []
+    missing_web_files = []
+    # Get dataset type (default to spatial2D if not found)
+    dataset_type = config.dataset_type
+    # === Web Report Files ===
+    web_report_dir = config.web_report_dir
+    js_data_dir = web_report_dir / "js_data"
+    core_web_files = [
+        web_report_dir / "index.html",
+        web_report_dir / "report_meta.json",
+        js_data_dir / "sample_index.js",
+        js_data_dir / "report_meta.js",
+        js_data_dir / "cauchy_results.js",
+    ]
+    for f in core_web_files:
+        if not f.exists():
+            missing_web_files.append(f)
+    # === Data Files ===
+    report_data_dir = config.report_data_dir
+    core_data_files = [
+        report_data_dir / "spot_metadata.csv",
+        report_data_dir / "gene_list.csv",
+        report_data_dir / "cauchy_results.csv",
+    ]
+    for f in core_data_files:
+        if not f.exists():
+            missing_data_files.append(f)
+    # Per-trait files
+    traits = config.trait_name_list
+    annotation_list = config.annotation_list
+    for trait in traits:
+        trait_gss_csv = report_data_dir / "gss_stats" / f"gene_trait_correlation_{trait}.csv"
+        trait_manhattan_csv = report_data_dir / "manhattan_data" / f"{trait}_manhattan.csv"
+        if not trait_gss_csv.exists():
+            missing_data_files.append(trait_gss_csv)
+        if not trait_manhattan_csv.exists():
+            missing_data_files.append(trait_manhattan_csv)
+        trait_gss_js = js_data_dir / "gss_stats" / f"gene_trait_correlation_{trait}.js"
+        trait_manhattan_js = js_data_dir / f"manhattan_{trait}.js"
+        if not trait_gss_js.exists():
+            missing_web_files.append(trait_gss_js)
+        if not trait_manhattan_js.exists():
+            missing_web_files.append(trait_manhattan_js)
+    # Per-sample spatial JS files (Spatial only)
+    if dataset_type in (DatasetType.SPATIAL_2D, DatasetType.SPATIAL_3D):
+        sample_h5ad_dict = config.sample_h5ad_dict
+        if sample_h5ad_dict:
+            for sample_name in sample_h5ad_dict.keys():
+                safe_name = "".join(c if c.isalnum() else "_" for c in sample_name)
+                sample_js = js_data_dir / f"sample_{safe_name}_spatial.js"
+                if not sample_js.exists():
+                    missing_web_files.append(sample_js)
+        # Gene diagnostic plots directory (Spatial only)
+        gene_plot_dir = web_report_dir / "gene_diagnostic_plots"
+        if not gene_plot_dir.exists():
+            missing_web_files.append(gene_plot_dir)
+    # 3D specific files (Spatial 3D only)
+    if dataset_type == DatasetType.SPATIAL_3D:
+        three_d_data_dir = report_data_dir / "spatial_3d"
+        three_d_web_dir = web_report_dir / "spatial_3d"
+        # 3D H5AD file
+        h5ad_3d = three_d_data_dir / "spatial_3d.h5ad"
+        if not h5ad_3d.exists():
+            missing_data_files.append(h5ad_3d)
+        # 3D HTML plot files (one per trait/annotation)
+        for trait in traits:
+            safe_trait = "".join(c if c.isalnum() else "_" for c in trait)
+            trait_3d_html = three_d_web_dir / f"spatial_3d_trait_{safe_trait}.html"
+            if not trait_3d_html.exists():
+                missing_web_files.append(trait_3d_html)
+        for anno in annotation_list:
+            safe_anno = "".join(c if c.isalnum() else "_" for c in anno)
+            anno_3d_html = three_d_web_dir / f"spatial_3d_anno_{safe_anno}.html"
+            if not anno_3d_html.exists():
+                missing_web_files.append(anno_3d_html)
+    # Multi-sample plots (Spatial only, and if enabled)
+    if dataset_type != DatasetType.SCRNA_SEQ and config.generate_multi_sample_plots:
+        spatial_plot_dir = web_report_dir / "spatial_plots"
+        annotation_plot_dir = web_report_dir / "annotation_plots"
+        for trait in traits:
+            plot_path = spatial_plot_dir / f"ldsc_{trait}.png"
+            if not plot_path.exists():
+                missing_web_files.append(plot_path)
+        for anno in annotation_list:
+            plot_path = annotation_plot_dir / f"anno_{anno}.png"
+            if not plot_path.exists():
+                missing_web_files.append(plot_path)
+    # UMAP data (optional)
+    concat_adata_path = config.concatenated_latent_adata_path
+    if concat_adata_path and concat_adata_path.exists():
+        umap_file = report_data_dir / "umap_data.csv"
+        umap_js = js_data_dir / "umap_data.js"
+        if not umap_file.exists():
+            missing_data_files.append(umap_file)
+        if not umap_js.exists():
+            missing_web_files.append(umap_js)
+    return missing_data_files, missing_web_files

gsMap/config/report_config.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""
+Configuration for generating reports.
+"""
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Annotated
+import typer
+from .base import ConfigWithAutoPaths, ensure_path_exists
+from .cauchy_config import CauchyCombinationConfig
+logger = logging.getLogger("gsMap.config")
+@dataclass
+class ReportConfig(CauchyCombinationConfig,ConfigWithAutoPaths):
+    """Report Generation Configuration"""
+    # Downsampling thresholds: each option carries its own min/max bounds.
+    downsampling_n_spots_pcc: Annotated[int, typer.Option(
+        help="Number of spots to downsample for PCC calculation if n_spots > this value",
+        min=1000,
+        max=100000
+    )] = 20000
+    downsampling_n_spots_3d: Annotated[int, typer.Option(
+        help="Number of spots to downsample for 3D visualization if n_spots > this value",
+        min=1000,
+        max=2000000
+    )] = 1000000
+    downsampling_n_spots_2d: Annotated[int, typer.Option(
+        help="Max spots per sample for 2D distribution plots. Samples with more spots will be randomly downsampled.",
+        min=10000,
+        max=500000
+    )] = 250000
+    top_corr_genes: Annotated[int, typer.Option(
+        help="Number of top correlated genes to display",
+        min=1,
+        max=500
+    )] = 50
+    # Advanced visualization parameters
+    single_sample_multi_trait_max_cols: int = 5
+    subsample_n_points: int | None = None
+    single_sample_multi_trait_subplot_width_inches: float = 4.0
+    single_sample_multi_trait_dpi: int = 300
+    enable_pdf_output: bool = True
+    hover_text_list: list | None = None
+    single_trait_multi_sample_max_cols: int = 8
+    single_trait_multi_sample_subplot_width_inches: float = 4.0
+    single_trait_multi_sample_scaling_factor: float = 1.0
+    single_trait_multi_sample_dpi: int = 300
+    share_coords: bool = False
+    # Whether to generate single-feature multi-sample plots (LDSC, annotation, and gene diagnostic plots)
+    generate_multi_sample_plots: bool = False
+    # Plot origin for spatial plots ('upper' or 'lower')
+    plot_origin: Annotated[str, typer.Option(
+        help="Plot origin for spatial plots ('upper' or 'lower'). 'upper' will flip the y-axis (standard for images)."
+    )] = "upper"
+    # Legend marker size for categorical plots
+    legend_marker_size: float = 10.0
+    # Force re-run of report generation even if results exist
+    force_report_re_run: bool = False
+    # Compatibility properties for visualization paths
+    # Resolves to <project_dir>/report/<project_name>/<trait_name>, falling back
+    # to "multi_trait" when trait_name is falsy.
+    @property
+    @ensure_path_exists
+    def visualization_result_dir(self) -> Path:
+        return self.project_dir / "report" / self.project_name / (self.trait_name or "multi_trait")
+    # Calls CauchyCombinationConfig's post-init explicitly (not via super()),
+    # then calls show_config for this class.
+    def __post_init__(self):
+        CauchyCombinationConfig.__post_init__(self)
+        self.show_config(ReportConfig)