PyPI - seqmat - Versions diffs - 0.1.0__py3-none-any.whl - Mend

seqmat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

seqmat/__init__.py +46 -0
seqmat/cli.py +280 -0
seqmat/config.py +100 -0
seqmat/gene.py +178 -0
seqmat/seqmat.py +478 -0
seqmat/transcript.py +319 -0
seqmat/utils.py +686 -0
seqmat-0.1.0.dist-info/METADATA +774 -0
seqmat-0.1.0.dist-info/RECORD +16 -0
seqmat-0.1.0.dist-info/WHEEL +5 -0
seqmat-0.1.0.dist-info/entry_points.txt +3 -0
seqmat-0.1.0.dist-info/licenses/LICENSE +21 -0
seqmat-0.1.0.dist-info/top_level.txt +2 -0
tests/__init__.py +1 -0
tests/test_gene.py +217 -0
tests/test_seqmat.py +128 -0

seqmat/__init__.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""
+SeqMat - Lightning-fast genomic sequence matrix library
+A comprehensive Python library for genomic sequence analysis with full mutation tracking,
+splicing analysis, and sequence manipulation.
+"""
+__version__ = "0.1.0"
+__author__ = "Nicolas Lynn Vila"
+__email__ = "nicolasalynn@gmail.com"
+from .seqmat import SeqMat
+from .gene import Gene
+from .transcript import Transcript
+from .utils import (
+    setup_genomics_data,
+    load_config,
+    save_config,
+    list_available_organisms,
+    list_supported_organisms,
+    get_organism_info,
+    list_gene_biotypes,
+    count_genes,
+    get_gene_list,
+    data_summary,
+    print_data_summary,
+    search_genes
+)
+# Public API of the package: the names exported by `from seqmat import *`.
+# Every entry is imported above from .seqmat, .gene, .transcript or .utils.
+__all__ = [
+    "SeqMat",
+    "Gene",
+    "Transcript",
+    "setup_genomics_data",
+    "load_config",
+    "save_config",
+    "list_available_organisms",
+    "list_supported_organisms",
+    "get_organism_info",
+    "list_gene_biotypes",
+    "count_genes",
+    "get_gene_list",
+    "data_summary",
+    "print_data_summary",
+    "search_genes"
+]

seqmat/cli.py ADDED Viewed

@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""Command-line interface for SeqMat data management"""
+import argparse
+import sys
+from typing import Optional
+from .utils import (
+    setup_genomics_data,
+    print_data_summary,
+    list_available_organisms,
+    list_supported_organisms,
+    list_gene_biotypes,
+    count_genes,
+    get_gene_list,
+    search_genes,
+    get_organism_info
+)
+from .config import get_available_organisms, get_default_organism, get_organism_info as get_organism_config_info
+def cmd_setup(args):
+    """Run setup_genomics_data() for the requested organism and report the outcome.
+    Args:
+        args: Parsed CLI namespace providing ``path``, ``organism`` and ``force``.
+    Any exception raised during setup is printed and turned into exit status 1.
+    """
+    try:
+        setup_genomics_data(basepath=args.path, organism=args.organism, force=args.force)
+        print(f"✅ Successfully set up {args.organism} data in {args.path}")
+    except Exception as exc:
+        # Top-level CLI boundary: show the message instead of a traceback.
+        print(f"❌ Error setting up data: {exc}")
+        sys.exit(1)
+def cmd_list_organisms(args):
+    """Print every supported organism with its display name and setup status.
+    Args:
+        args: Parsed CLI namespace (unused; kept for the uniform command signature).
+    """
+    print("🌍 Organism Support Status:")
+    print("-" * 30)
+    supported = list_supported_organisms()
+    configured = list_available_organisms()
+    # A set makes the per-organism "is it configured?" check O(1).
+    configured_ids = set(configured)
+    for org in supported:
+        # Resolve the display name; fall back to the raw identifier when the
+        # lookup fails. `except Exception` (not a bare except) so Ctrl-C and
+        # SystemExit are never swallowed here.
+        try:
+            name = get_organism_config_info(org).get('name', org)
+        except Exception:
+            name = org
+        status = "✅ Configured" if org in configured_ids else "❌ Not configured"
+        print(f"{org}: {name} - {status}")
+    if not configured:
+        print("\nTo set up data, run:")
+        print("  seqmat-setup --path /your/data/path --organism hg38")
+def cmd_summary(args):
+    """Print the data summary by delegating to print_data_summary().
+    Args:
+        args: Parsed CLI namespace (unused; kept for the uniform command signature).
+    """
+    print_data_summary()
+def cmd_biotypes(args):
+    """Print each gene biotype of an organism together with its gene count.
+    Args:
+        args: Parsed CLI namespace providing ``organism``.
+    Exits with status 1 when no organism is given or no biotypes are found.
+    """
+    organism = args.organism
+    if not organism:
+        print("❌ Please specify an organism with --organism")
+        sys.exit(1)
+    biotypes = list_gene_biotypes(organism)
+    if not biotypes:
+        print(f"❌ No data found for organism '{organism}'")
+        print("Available organisms:", ", ".join(list_available_organisms()))
+        sys.exit(1)
+    print(f"📊 Gene biotypes in {organism}:")
+    print("-" * 30)
+    # One count lookup for the whole organism; biotypes missing from it show 0.
+    per_biotype = count_genes(organism)
+    for biotype in biotypes:
+        print(f"{biotype}: {per_biotype.get(biotype, 0):,} genes")
+def cmd_count(args):
+    """Print gene counts for an organism, for one biotype or for all of them.
+    Args:
+        args: Parsed CLI namespace providing ``organism`` and optional ``biotype``.
+    Exits with status 1 when no organism is given or no counts are returned.
+    """
+    if not args.organism:
+        print("❌ Please specify an organism with --organism")
+        sys.exit(1)
+    organism, wanted = args.organism, args.biotype
+    counts = count_genes(organism, wanted)
+    if not counts:
+        print(f"❌ No data found for organism '{organism}'")
+        sys.exit(1)
+    if wanted:
+        # Single-biotype mode: one line, no table.
+        print(f"📊 {organism} {wanted}: {counts.get(wanted, 0):,} genes")
+        return
+    rule = "-" * 30
+    print(f"📊 Gene counts for {organism}:")
+    print(rule)
+    running_total = 0
+    for name, amount in sorted(counts.items()):
+        print(f"{name}: {amount:,} genes")
+        running_total += amount
+    print(rule)
+    print(f"Total: {running_total:,} genes")
+def cmd_list_genes(args):
+    """Print a numbered list of genes for one organism and biotype.
+    Args:
+        args: Parsed CLI namespace providing ``organism``, ``biotype`` and ``limit``.
+    Exits with status 1 when an argument is missing or no genes are returned.
+    """
+    if not (args.organism and args.biotype):
+        print("❌ Please specify both --organism and --biotype")
+        sys.exit(1)
+    organism, biotype, limit = args.organism, args.biotype, args.limit
+    genes = get_gene_list(organism, biotype, limit=limit)
+    if not genes:
+        print(f"❌ No genes found for {organism} {biotype}")
+        sys.exit(1)
+    print(f"📋 {organism} {biotype} genes ({len(genes)} shown):")
+    print("-" * 50)
+    for position, gene in enumerate(genes, start=1):
+        print(f"{position:4d}. {gene}")
+    # A full page means the list may have been cut off, so report the real total.
+    if limit and len(genes) == limit:
+        total = count_genes(organism, biotype).get(biotype, 0)
+        print(f"\n(Showing first {limit} of {total:,} total genes)")
+def cmd_search(args):
+    """Print the genes returned by search_genes() for a name pattern.
+    Args:
+        args: Parsed CLI namespace providing ``organism``, ``query``, ``biotype`` and ``limit``.
+    Exits with status 1 when an argument is missing or nothing matches.
+    """
+    if not (args.organism and args.query):
+        print("❌ Please specify both --organism and --query")
+        sys.exit(1)
+    matches = search_genes(
+        organism=args.organism,
+        query=args.query,
+        biotype=args.biotype,
+        limit=args.limit
+    )
+    if not matches:
+        print(f"❌ No genes found matching '{args.query}' in {args.organism}")
+        sys.exit(1)
+    print(f"🔍 Search results for '{args.query}' in {args.organism}:")
+    print("-" * 50)
+    for rank, hit in enumerate(matches, start=1):
+        print(f"{rank:2d}. {hit['gene_name']} ({hit['biotype']})")
+    # Exactly `limit` hits suggests the result list was truncated.
+    if len(matches) == args.limit:
+        print(f"\n(Showing first {args.limit} results)")
+def cmd_info(args):
+    """Show detailed information about an organism: gene counts, chromosomes and data paths.
+    Args:
+        args: Parsed CLI namespace providing ``organism``.
+    Exits with status 1 when no organism is given or the info dict carries an ``error`` key.
+    """
+    # Imported once here rather than on every iteration of the path loop below.
+    from pathlib import Path
+    if not args.organism:
+        print("❌ Please specify an organism with --organism")
+        sys.exit(1)
+    info = get_organism_info(args.organism)
+    if "error" in info:
+        print(f"❌ {info['error']}")
+        sys.exit(1)
+    print(f"ℹ️  Detailed information for {args.organism}:")
+    print("=" * 40)
+    # Data availability
+    data_avail = info.get("data_available", {})
+    if "gene_counts" in data_avail:
+        print("📊 Gene Data:")
+        total_genes = 0
+        for biotype, count in sorted(data_avail["gene_counts"].items()):
+            print(f"  {biotype}: {count:,} genes")
+            total_genes += count
+        print(f"  Total: {total_genes:,} genes")
+        print()
+    if "chromosomes" in data_avail:
+        chroms = data_avail["chromosomes"]
+        print(f"🧬 Chromosome Data: {len(chroms)} chromosomes")
+        print(f"  Available: {', '.join(sorted(chroms))}")
+        print()
+    print("📁 Data Paths:")
+    # .get() mirrors the tolerant "data_available" lookup above: an info dict
+    # without "paths" prints an empty section instead of raising KeyError.
+    for path_name, path_value in info.get("paths", {}).items():
+        exists = "✅" if Path(path_value).exists() else "❌"
+        print(f"  {path_name}: {exists} {path_value}")
+def main():
+    """Build the ``seqmat`` argument parser, parse the command line and run the chosen command.
+    Prints the help text and exits with status 1 when no sub-command is given.
+    """
+    cli = argparse.ArgumentParser(
+        prog="seqmat",
+        description="SeqMat genomics data management CLI"
+    )
+    commands = cli.add_subparsers(dest="command", help="Commands")
+    def register(name, summary, handler):
+        # Create one sub-command and bind the function that implements it.
+        sub = commands.add_parser(name, help=summary)
+        sub.set_defaults(func=handler)
+        return sub
+    # setup: organism choices and the default come from the configuration
+    setup_cmd = register("setup", "Set up genomics data", cmd_setup)
+    setup_cmd.add_argument("--path", required=True, help="Base path for data storage")
+    organism_choices = get_available_organisms()
+    fallback_organism = get_default_organism()
+    setup_cmd.add_argument("--organism", default=fallback_organism, choices=organism_choices,
+                           help=f"Organism to set up (default: {fallback_organism})")
+    setup_cmd.add_argument("--force", action="store_true", help="Force overwrite existing data")
+    # organisms / summary: no options of their own
+    register("organisms", "List supported/configured organisms", cmd_list_organisms)
+    register("summary", "Show data summary", cmd_summary)
+    # biotypes
+    biotypes_cmd = register("biotypes", "List gene biotypes", cmd_biotypes)
+    biotypes_cmd.add_argument("--organism", help="Organism to query")
+    # count
+    count_cmd = register("count", "Count genes", cmd_count)
+    count_cmd.add_argument("--organism", help="Organism to query")
+    count_cmd.add_argument("--biotype", help="Specific biotype to count")
+    # list
+    list_cmd = register("list", "List genes", cmd_list_genes)
+    list_cmd.add_argument("--organism", help="Organism to query")
+    list_cmd.add_argument("--biotype", help="Gene biotype")
+    list_cmd.add_argument("--limit", type=int, default=50, help="Maximum genes to show")
+    # search
+    search_cmd = register("search", "Search genes by name", cmd_search)
+    search_cmd.add_argument("--organism", help="Organism to search")
+    search_cmd.add_argument("--query", help="Gene name pattern to search")
+    search_cmd.add_argument("--biotype", help="Filter by biotype")
+    search_cmd.add_argument("--limit", type=int, default=20, help="Maximum results")
+    # info
+    info_cmd = register("info", "Show organism information", cmd_info)
+    info_cmd.add_argument("--organism", help="Organism to query")
+    parsed = cli.parse_args()
+    if not parsed.command:
+        # No sub-command: show usage and signal failure to the shell.
+        cli.print_help()
+        sys.exit(1)
+    parsed.func(parsed)
+if __name__ == "__main__":
+    main()

seqmat/config.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Configuration management for SeqMat"""
+import os
+import json
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+# Per-user configuration lives in ~/.seqmat/config.json.
+DEFAULT_CONFIG_DIR = Path.home() / '.seqmat'
+CONFIG_FILE = DEFAULT_CONFIG_DIR / 'config.json'
+# Default organism data sources - can be overridden in config
+# Keyed by organism build; each entry has a display 'name' and a 'urls' map
+# (presumably the download locations used by setup - confirm in utils).
+DEFAULT_ORGANISM_DATA = {
+    'hg38': {
+        'name': 'Homo sapiens (Human)',
+        'urls': {
+            'fasta': 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz',
+            'gtf': 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz',
+            'conservation': 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl',
+            'gtex': 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz'
+        }
+    },
+    'mm39': {
+        'name': 'Mus musculus (Mouse)',
+        'urls': {
+            'fasta': 'https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz',
+            'gtf': 'https://ftp.ensembl.org/pub/release-112/gtf/mus_musculus/Mus_musculus.GRCm39.112.gtf.gz'
+        }
+    }
+}
+# Settings used when the config file is absent or omits a key.
+# These two key names are also the ones get_available_organisms() excludes
+# when it lists organisms from the config.
+DEFAULT_SETTINGS = {
+    'default_organism': 'hg38',
+    'directory_structure': {
+        'chromosomes': 'chromosomes',
+        'annotations': 'annotations'
+    }
+}
+def load_config() -> Dict[str, Any]:
+    """Load configuration from user's home directory.
+    Returns:
+        The default settings overlaid (top-level keys only) with the contents of
+        the config file when it exists. The result is an independent copy:
+        mutating it never changes the module-level defaults.
+    """
+    # Function-scope import keeps this fix self-contained.
+    import copy
+    # Deep copy: a shallow .copy() would hand out the very same nested
+    # 'directory_structure' dict that DEFAULT_SETTINGS holds.
+    merged_config = copy.deepcopy(DEFAULT_SETTINGS)
+    if CONFIG_FILE.exists():
+        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
+            merged_config.update(json.load(f))
+    return merged_config
+def save_config(config: Dict[str, Any]) -> None:
+    """Save configuration to user's home directory.
+    Args:
+        config: JSON-serialisable settings written to the config file (indent 2).
+    """
+    # parents=True so a missing intermediate directory cannot make the save fail.
+    DEFAULT_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding keeps the file independent of the platform locale.
+    with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
+        json.dump(config, f, indent=2)
+def get_default_organism() -> str:
+    """Return the 'default_organism' entry of the loaded config, or the built-in default."""
+    fallback = DEFAULT_SETTINGS['default_organism']
+    return load_config().get('default_organism', fallback)
+def get_available_organisms() -> List[str]:
+    """Return the sorted union of organisms named in the config and in the built-in defaults."""
+    # Top-level config keys that are settings rather than organisms.
+    settings_keys = {'default_organism', 'directory_structure'}
+    names = {key for key in load_config().keys() if key not in settings_keys}
+    names.update(DEFAULT_ORGANISM_DATA.keys())
+    return sorted(names)
+def get_organism_info(organism: str) -> Dict[str, Any]:
+    """Get organism information including name and URLs.
+    Args:
+        organism: Organism build identifier (e.g. 'hg38').
+    Returns:
+        The built-in entry for the organism with the user's config entry laid
+        over it (top-level keys only). Always an independent copy, so callers
+        may modify it without touching the module defaults.
+    Raises:
+        ValueError: If the organism is in neither the config nor the defaults.
+    """
+    # Function-scope import keeps this fix self-contained.
+    import copy
+    user_entry = load_config().get(organism)
+    defaults = DEFAULT_ORGANISM_DATA.get(organism)
+    if isinstance(user_entry, dict):
+        # Deep copies: the nested 'urls' dict must not be shared with the defaults.
+        merged = copy.deepcopy(defaults) if defaults is not None else {}
+        merged.update(copy.deepcopy(user_entry))
+        return merged
+    if defaults is not None:
+        return copy.deepcopy(defaults)
+    raise ValueError(f"Organism '{organism}' not configured. Available: {get_available_organisms()}")
+def get_organism_config(organism: Optional[str] = None) -> Dict[str, Path]:
+    """Get configuration paths for a specific organism.
+    Args:
+        organism: Organism build identifier; defaults to the configured default organism.
+    Returns:
+        The organism's string-valued config entries converted to Path objects
+        (non-string values are skipped).
+    Raises:
+        ValueError: If the config has no dict entry for the organism.
+    """
+    if organism is None:
+        organism = get_default_organism()
+    org_config = load_config().get(organism)
+    # A key that exists but is not a mapping (e.g. the 'default_organism'
+    # setting) is not an organism entry; report it the same way as a missing
+    # one instead of failing later with AttributeError on .items().
+    if not isinstance(org_config, dict):
+        raise ValueError(f"Organism '{organism}' not configured. Run setup_genomics_data() first.")
+    # Convert string paths to Path objects
+    return {k: Path(v) for k, v in org_config.items() if isinstance(v, str)}
+def get_directory_config() -> Dict[str, str]:
+    """Return the 'directory_structure' entry of the loaded config, or the built-in default."""
+    fallback = DEFAULT_SETTINGS['directory_structure']
+    return load_config().get('directory_structure', fallback)

seqmat/gene.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Gene class for representing genomic genes with associated transcripts"""
+import copy
+from typing import Any, Dict, List, Tuple, Optional, Iterator, Union
+from collections import Counter
+from pathlib import Path
+from .config import get_organism_config, get_default_organism
+from .utils import unload_pickle
+from .transcript import Transcript
+class Gene:
+    """
+    A class representing a Gene, with associated transcripts and metadata.
+    Attributes:
+        organism (str): The organism build (e.g. 'hg38').
+        transcripts (dict): A dictionary of transcript annotations keyed by transcript ID.
+        gene_name (str): The name of the gene.
+        gene_id (str): The unique identifier for the gene.
+        chrm (str): The chromosome on which the gene resides.
+        rev (bool): Whether the gene is on the reverse strand.
+    """
+    def __init__(self, gene_name: str, gene_id: str, rev: bool, chrm: str,
+                 transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None):
+        """
+        Create a Gene from its identifying metadata.
+        Args:
+            gene_name: Name of the gene
+            gene_id: Unique identifier for the gene
+            rev: Whether gene is on reverse strand
+            chrm: Chromosome identifier
+            transcripts: Transcript annotations keyed by transcript ID (empty dict when omitted)
+            organism: Organism reference build (configured default when omitted)
+        """
+        if organism is None:
+            organism = get_default_organism()
+        if transcripts is None:
+            transcripts = {}
+        self.gene_name, self.gene_id = gene_name, gene_id
+        self.rev, self.chrm = rev, chrm
+        self.organism = organism
+        self.transcripts = transcripts
+    def __repr__(self) -> str:
+        """Return the compact ``Gene(<gene_name>)`` representation."""
+        return "Gene({})".format(self.gene_name)
+    def __str__(self) -> str:
+        """Return a one-line summary: name, ID, chromosome and transcript count."""
+        fields = (
+            f"Gene: {self.gene_name}",
+            f"ID: {self.gene_id}",
+            f"Chr: {self.chrm}",
+            f"Transcripts: {len(self.transcripts)}",
+        )
+        return ", ".join(fields)
+    def __len__(self) -> int:
+        """Return the number of transcript annotations held by this gene (``len(self.transcripts)``)."""
+        return len(self.transcripts)
+    def __copy__(self):
+        """Return a shallow copy: a new Gene whose attributes reference the same objects.
+        Implemented by hand because calling ``copy.copy(self)`` from inside
+        ``__copy__`` dispatches straight back to this method and never terminates.
+        """
+        cls = self.__class__
+        # __new__ skips __init__, so copying does no config lookup.
+        duplicate = cls.__new__(cls)
+        duplicate.__dict__.update(self.__dict__)
+        return duplicate
+    def __deepcopy__(self, memo):
+        """Return a deep copy: a new Gene whose attributes are recursively copied.
+        Args:
+            memo: The memo dict passed in by ``copy.deepcopy``.
+        Implemented by hand because calling ``copy.deepcopy(self, memo)`` from
+        inside ``__deepcopy__`` dispatches straight back to this method and
+        never terminates.
+        """
+        cls = self.__class__
+        duplicate = cls.__new__(cls)
+        # Register the copy before descending so reference cycles resolve to it.
+        memo[id(self)] = duplicate
+        for attr, value in self.__dict__.items():
+            setattr(duplicate, attr, copy.deepcopy(value, memo))
+        return duplicate
+    def __iter__(self) -> Iterator[Transcript]:
+        """Yield one Transcript per stored annotation, in dictionary order."""
+        # Only the annotation payloads are needed; the transcript IDs are not used.
+        for annotations in self.transcripts.values():
+            yield Transcript(annotations, organism=self.organism)
+    def __getitem__(self, item: str) -> Optional[Transcript]:
+        """Return the Transcript for transcript ID *item*, or None (with a printed notice) if unknown."""
+        if item in self.transcripts:
+            return Transcript(self.transcripts[item], organism=self.organism)
+        print(f"{item} not an annotated transcript of this gene.")
+        return None
+    @classmethod
+    def from_file(cls, gene_name: str, organism: Optional[str] = None) -> Optional['Gene']:
+        """
+        Load gene data from file.
+        Looks for ``*_<gene_name>.pkl`` in the ``protein_coding`` sub-directory of
+        the organism's configured ``MRNA_PATH`` and builds a Gene from the first match.
+        Args:
+            gene_name: Name of the gene to load
+            organism: Organism reference build (configured default when omitted)
+        Returns:
+            Gene object, or None (after printing a notice) if the organism is not
+            configured, has no MRNA_PATH, or no matching file exists
+        """
+        if organism is None:
+            organism = get_default_organism()
+        try:
+            config = get_organism_config(organism)
+        except ValueError:
+            print(f"Organism '{organism}' not configured. Run setup_genomics_data() first.")
+            return None
+        # An organism entry without MRNA_PATH is reported like the other
+        # "not available" cases instead of surfacing as a KeyError.
+        mrna_path = config.get('MRNA_PATH')
+        if mrna_path is None:
+            print(f"Organism '{organism}' has no MRNA_PATH configured. Run setup_genomics_data() first.")
+            return None
+        # Find gene data files in the configured organism MRNA path.
+        # sorted() makes "the first matching file" deterministic; raw glob order
+        # depends on the filesystem.
+        gene_files = sorted((mrna_path / 'protein_coding').glob(f'*_{gene_name}.pkl'))
+        if not gene_files:
+            print(f"No files available for gene '{gene_name}'.")
+            return None
+        # Load gene data from the first matching file.
+        # NOTE(review): unload_pickle presumably unpickles the file - only point
+        # MRNA_PATH at trusted data; confirm against utils.
+        data = unload_pickle(gene_files[0])
+        return cls(
+            gene_name=data.get('gene_name'),
+            gene_id=data.get('gene_id'),
+            rev=data.get('rev'),
+            chrm=data.get('chrm'),
+            transcripts=data.get('transcripts', {}),
+            organism=organism
+        )
+    def splice_sites(self) -> Tuple[Counter, Counter]:
+        """
+        Count how often each acceptor and donor site occurs across all transcripts.
+        Returns:
+            tuple(Counter, Counter): (acceptor counts, donor counts). Transcripts
+            without an 'acceptors' / 'donors' entry contribute nothing.
+        """
+        annotations = list(self.transcripts.values())
+        acceptor_sites = [site for entry in annotations for site in entry.get('acceptors', [])]
+        donor_sites = [site for entry in annotations for site in entry.get('donors', [])]
+        return Counter(acceptor_sites), Counter(donor_sites)
+    def transcript(self, tid: Optional[str] = None) -> Optional[Transcript]:
+        """
+        Build the Transcript for a given ID, defaulting to the primary transcript.
+        Args:
+            tid: Transcript ID. If None, the primary transcript ID is used.
+        Returns:
+            The matching Transcript, or None when there is no primary transcript
+            or the ID is not among this gene's transcripts.
+        """
+        wanted = self.primary_transcript if tid is None else tid
+        if wanted is not None and wanted in self.transcripts:
+            return Transcript(self.transcripts[wanted], organism=self.organism)
+        return None
+    @property
+    def primary_transcript(self) -> Optional[str]:
+        """
+        ID of the gene's primary transcript, computed once and then cached.
+        Preference order: the first transcript flagged 'primary_transcript',
+        otherwise the first whose 'transcript_biotype' is 'protein_coding'.
+        Returns:
+            The chosen transcript ID, or None if neither rule matches.
+        """
+        # Serve the cached answer (which may legitimately be None) when present.
+        try:
+            return self._primary_transcript
+        except AttributeError:
+            pass
+        selection_rules = (
+            lambda ann: ann.get('primary_transcript'),
+            lambda ann: ann.get('transcript_biotype') == 'protein_coding',
+        )
+        chosen = None
+        for rule in selection_rules:
+            candidates = [tid for tid, ann in self.transcripts.items() if rule(ann)]
+            if candidates:
+                chosen = candidates[0]
+                break
+        self._primary_transcript = chosen
+        return chosen