PyPI - cool-seq-tool - Versions diffs - 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

cool_seq_tool/__init__.py +6 -0
cool_seq_tool/app.py +1 -2
cool_seq_tool/handlers/seqrepo_access.py +5 -5
cool_seq_tool/mappers/alignment.py +16 -16
cool_seq_tool/mappers/exon_genomic_coords.py +845 -628
cool_seq_tool/mappers/mane_transcript.py +184 -152
cool_seq_tool/schemas.py +30 -438
cool_seq_tool/sources/mane_transcript_mappings.py +35 -0
cool_seq_tool/sources/uta_database.py +149 -229
cool_seq_tool/utils.py +9 -9
{cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/METADATA +8 -8
cool_seq_tool-0.7.0.dist-info/RECORD +24 -0
{cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/WHEEL +1 -1
cool_seq_tool-0.5.1.dist-info/RECORD +0 -24
{cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/LICENSE +0 -0
{cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/top_level.txt +0 -0

cool_seq_tool/schemas.py CHANGED Viewed

@@ -9,7 +9,6 @@ from pydantic import (
     ConfigDict,
     StrictInt,
     StrictStr,
-    model_validator,
 )
 from cool_seq_tool import __version__
@@ -20,9 +19,9 @@ _now = str(datetime.datetime.now(tz=datetime.timezone.utc))
 class AnnotationLayer(str, Enum):
     """Create enum for supported annotation layers"""
-    PROTEIN: Literal["p"] = "p"
-    CDNA: Literal["c"] = "c"
-    GENOMIC: Literal["g"] = "g"
+    PROTEIN = "p"
+    CDNA = "c"
+    GENOMIC = "g"
 class Strand(IntEnum):
@@ -53,15 +52,17 @@ class TranscriptPriority(str, Enum):
     GRCH38 = "grch38"
-class ResidueMode(str, Enum):
-    """Create Enum for residue modes.
+class CoordinateType(str, Enum):
+    """Create Enum for coordinate types.
-    We typically prefer to operate in inter-residue coordinates, but users should be
+    It is preferred to operate in inter-residue coordinates, but users should be
     careful to define the coordinate mode of their data when calling ``cool-seq-tool``
     functions.
+    ``RESIDUE`` means 1-indexed, residue coordinates and ``INTER_RESIDUE`` means
+    0-indexed, inter-residue coordinates.
                       |   | C |   | T |   | G |   |
-    ZERO              |   | 0 |   | 1 |   | 2 |   |
     RESIDUE           |   | 1 |   | 2 |   | 3 |   |
     INTER_RESIDUE     | 0 |   | 1 |   | 2 |   | 3 |
@@ -77,14 +78,6 @@ class ResidueMode(str, Enum):
          -
          - G
          -
-       * - ``ZERO``
-         -
-         - 0
-         -
-         - 1
-         -
-         - 2
-         -
        * - ``RESIDUE``
          -
          - 1
@@ -107,7 +100,6 @@ class ResidueMode(str, Enum):
     `Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
     """
-    ZERO = "zero"
     RESIDUE = "residue"
     INTER_RESIDUE = "inter-residue"
@@ -116,157 +108,35 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"):
     """Base Pydantic model class with extra values forbidden."""
-class GenomicRequestBody(BaseModelForbidExtra):
-    """Define constraints for genomic to transcript exon coordinates request body"""
-    chromosome: StrictStr | StrictInt
-    start: StrictInt | None = None
-    end: StrictInt | None = None
-    strand: Strand | None = None
-    transcript: StrictStr | None = None
-    gene: StrictStr | None = None
-    residue_mode: ResidueMode = ResidueMode.RESIDUE
-    @model_validator(mode="after")
-    def check_start_and_end(cls, values):
-        """Check that at least one of {``start``, ``end``} is set"""
-        start, end = values.start, values.end
-        if not start or end:
-            msg = "Must provide either `start` or `end`"
-            raise ValueError(msg)
-        return values
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "chromosome": "NC_000001.11",
-                "start": 154192135,
-                "end": None,
-                "strand": Strand.NEGATIVE,
-                "transcript": "NM_152263.3",
-                "gene": "TPM3",
-                "residue_mode": "residue",
-            }
-        }
-    )
-class TranscriptRequestBody(BaseModelForbidExtra):
-    """Define constraints for transcript exon to genomic coordinates request body"""
+class GenomicTxData(BaseModelForbidExtra):
+    """Represent aligned genomic/transcript exon data"""
-    transcript: StrictStr
-    gene: StrictStr | None = None
-    exon_start: StrictInt | None = None
-    exon_start_offset: StrictInt | None = 0
-    exon_end: StrictInt | None = None
-    exon_end_offset: StrictInt | None = 0
-    @model_validator(mode="after")
-    def check_exon_start_and_exon_end(cls, values):
-        """Check that at least one of {``exon_start``, ``exon_end``} is set"""
-        exon_start, exon_end = values.exon_start, values.exon_end
-        if not exon_start or exon_end:
-            msg = "Must provide either `exon_start` or `exon_end`"
-            raise ValueError(msg)
-        return values
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "gene": "TPM3",
-                "transcript": "NM_152263.3",
-                "exon_start": 1,
-                "exon_start_offset": 1,
-                "exon_end": None,
-                "exon_end_offset": None,
-            }
-        }
-    )
-class TranscriptExonData(BaseModelForbidExtra):
-    """Model containing transcript exon data."""
-    transcript: StrictStr
-    pos: StrictInt
-    exon: StrictInt
-    exon_offset: StrictInt = 0
-    gene: StrictStr
-    chr: StrictStr
+    gene: str
     strand: Strand
+    tx_pos_range: tuple[int, int]
+    alt_pos_range: tuple[int, int]
+    alt_aln_method: str
+    tx_exon_id: int
+    alt_exon_id: int
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "chr": "NC_000001.11",
-                "gene": "TPM3",
-                "pos": 154192135,
-                "exon": 1,
-                "exon_offset": 0,
-                "transcript": "NM_152263.3",
-                "strand": Strand.NEGATIVE,
-            }
-        }
-    )
+class GenomicTxMetadata(GenomicTxData):
+    """Store relevant metadata for genomic and transcript accessions"""
-class GenomicData(BaseModelForbidExtra):
-    """Model containing genomic and transcript exon data."""
+    tx_ac: str
+    alt_ac: str
+    coding_start_site: int = 0
+    coding_end_site: int = 0
+    alt_pos_change_range: tuple[int, int]
+    pos_change: tuple[int, int] | None
-    gene: StrictStr
-    chr: StrictStr
-    start: StrictInt | None = None  # Genomic start position
-    end: StrictInt | None = None  # Genomic end position
-    exon_start: StrictInt | None = None
-    exon_start_offset: StrictInt | None = 0
-    exon_end: StrictInt | None = None
-    exon_end_offset: StrictInt | None = 0
-    transcript: StrictStr
-    strand: Strand
-    @model_validator(mode="after")
-    def check_start_end(cls, values):
-        """Check that at least one of {``start``, ``end``} is set.
-        Check that at least one of {``exon_start``, ``exon_end``} is set.
-        If not set, set corresponding offset to ``None``
-        """
-        start = values.start
-        end = values.end
-        if not start and not end:
-            msg = "Missing values for `start` or `end`"
-            raise ValueError(msg)
-        if start:
-            if not values.exon_start:
-                msg = "Missing value `exon_start`"
-                raise ValueError(msg)
-        else:
-            values.exon_start_offset = None
-        if end:
-            if not values.exon_end:
-                msg = "Missing value `exon_end`"
-                raise ValueError(msg)
-        else:
-            values.exon_end_offset = None
-        return values
+class ManeGeneData(BaseModel, extra="forbid"):
+    """Define minimal object model for representing a MANE gene"""
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "gene": "TPM3",
-                "chr": "NC_000001.11",
-                "start": 154192135,
-                "end": None,
-                "exon_start": 1,
-                "exon_end": None,
-                "exon_start_offset": 0,
-                "exon_end_offset": None,
-                "transcript": "NM_152263.3",
-                "strand": Strand.NEGATIVE,
-            }
-        }
-    )
+    ncbi_gene_id: StrictInt
+    hgnc_id: StrictInt | None
+    symbol: StrictStr
 class ServiceMeta(BaseModelForbidExtra):
@@ -289,281 +159,3 @@ class ServiceMeta(BaseModelForbidExtra):
             }
         }
     )
-class TranscriptExonDataResponse(BaseModelForbidExtra):
-    """Response model for Transcript Exon Data"""
-    transcript_exon_data: TranscriptExonData | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "transcript_exon_data": {
-                    "chr": "NC_000001.11",
-                    "gene": "TPM3",
-                    "pos": 154192135,
-                    "exon": 1,
-                    "exon_offset": 0,
-                    "transcript": "NM_152263.3",
-                    "strand": Strand.NEGATIVE,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )
-class GenomicDataResponse(BaseModelForbidExtra):
-    """Response model for Genomic Data"""
-    genomic_data: GenomicData | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "genomic_data": {
-                    "gene": "TPM3",
-                    "chr": "NC_000001.11",
-                    "start": 154192135,
-                    "end": None,
-                    "exon_start": 1,
-                    "exon_end": None,
-                    "exon_start_offset": 0,
-                    "exon_end_offset": None,
-                    "transcript": "NM_152263.3",
-                    "strand": Strand.NEGATIVE,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )
-class MappedManeData(BaseModel):
-    """Define mapped mane data fields"""
-    gene: StrictStr
-    refseq: StrictStr
-    ensembl: StrictStr | None = None
-    strand: Strand
-    status: TranscriptPriority
-    alt_ac: StrictStr
-    assembly: Assembly
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "gene": "BRAF",
-                "refseq": "NM_001374258.1",
-                "ensembl": "ENST00000644969.2",
-                "strand": Strand.NEGATIVE,
-                "status": TranscriptPriority.MANE_PLUS_CLINICAL,
-                "alt_ac": "NC_000007.13",
-                "assembly": Assembly.GRCH37,
-            }
-        }
-    )
-class MappedManeDataService(BaseModelForbidExtra):
-    """Service model response for mapped mane data"""
-    mapped_mane_data: MappedManeData | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "mapped_mane_data": {
-                    "gene": "BRAF",
-                    "refseq": "NM_001374258.1",
-                    "ensembl": "ENST00000644969.2",
-                    "strand": Strand.NEGATIVE,
-                    "status": TranscriptPriority.MANE_PLUS_CLINICAL,
-                    "alt_ac": "NC_000007.13",
-                    "assembly": Assembly.GRCH37,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )
-class ManeData(BaseModel):
-    """Define mane data fields"""
-    gene: StrictStr | None = None
-    refseq: StrictStr | None = None
-    ensembl: StrictStr | None = None
-    pos: tuple[int, int]
-    strand: Strand
-    status: TranscriptPriority
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "gene": "BRAF",
-                "refseq": "NP_004324.2",
-                "ensembl": "ENSP00000493543.1",
-                "pos": (598, 598),
-                "strand": Strand.NEGATIVE,
-                "status": TranscriptPriority.MANE_SELECT,
-            }
-        }
-    )
-class ManeDataService(BaseModelForbidExtra):
-    """Service model response for getting mane data"""
-    mane_data: ManeData | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "mane_data": {
-                    "gene": "BRAF",
-                    "refseq": "NP_004324.2",
-                    "ensembl": "ENSP00000493543.1",
-                    "pos": (598, 598),
-                    "strand": Strand.NEGATIVE,
-                    "status": TranscriptPriority.MANE_SELECT,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )
-# ALIGNMENT MAPPER SERVICE SCHEMAS
-class CdnaRepresentation(BaseModelForbidExtra):
-    """Model response for cDNA representation"""
-    c_ac: StrictStr
-    c_start_pos: StrictInt
-    c_end_pos: StrictInt
-    cds_start: StrictInt
-    residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "c_ac": "NM_004333.6",
-                "c_start_pos": 1797,
-                "c_end_pos": 1800,
-                "cds_start": 226,
-                "residue_mode": ResidueMode.INTER_RESIDUE,
-            }
-        }
-    )
-class ToCdnaService(BaseModelForbidExtra):
-    """Service model response for protein -> cDNA"""
-    c_data: CdnaRepresentation | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "c_data": {
-                    "c_ac": "NM_004333.6",
-                    "c_start_pos": 1797,
-                    "c_end_pos": 1800,
-                    "cds_start": 226,
-                    "residue_mode": ResidueMode.INTER_RESIDUE,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )
-class GenomicRepresentation(BaseModelForbidExtra):
-    """Model response for genomic representation"""
-    g_ac: str
-    g_start_pos: int
-    g_end_pos: int
-    residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "g_ac": "NC_000007.13",
-                "g_start_pos": 140453134,
-                "g_end_pos": 140453137,
-                "residue_mode": ResidueMode.INTER_RESIDUE,
-            }
-        }
-    )
-class ToGenomicService(BaseModelForbidExtra):
-    """Service model response for cDNA -> genomic"""
-    g_data: GenomicRepresentation | None = None
-    warnings: list[StrictStr] = []
-    service_meta: ServiceMeta
-    model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "g_data": {
-                    "g_ac": "NC_000007.13",
-                    "g_start_pos": 140453134,
-                    "g_end_pos": 140453137,
-                    "residue_mode": ResidueMode.INTER_RESIDUE,
-                },
-                "warnings": [],
-                "service_meta": {
-                    "name": "cool_seq_tool",
-                    "version": __version__,
-                    "response_datetime": _now,
-                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
-                },
-            }
-        }
-    )

cool_seq_tool/sources/mane_transcript_mappings.py CHANGED Viewed

@@ -8,6 +8,7 @@ from pathlib import Path
 import polars as pl
 from cool_seq_tool.resources.data_files import DataFile, get_data_file
+from cool_seq_tool.schemas import ManeGeneData
 _logger = logging.getLogger(__name__)
@@ -103,3 +104,37 @@ class ManeTranscriptMappings:
         mane_rows = mane_rows.sort(by="MANE_status", descending=True)
         return mane_rows.to_dicts()
+    def get_genomic_mane_genes(
+        self, ac: str, start: int, end: int
+    ) -> list[ManeGeneData]:
+        """Get MANE gene(s) for genomic location
+        :param ac: RefSeq genomic accession
+        :param start: Genomic start position. Assumes residue coordinates.
+        :param end: Genomic end position. Assumes residue coordinates.
+        :return: Unique MANE gene(s) found for a genomic location
+        """
+        mane_rows = self.df.filter(
+            (start >= pl.col("chr_start"))
+            & (end <= pl.col("chr_end"))
+            & (pl.col("GRCh38_chr") == ac)
+        ).unique(subset=["#NCBI_GeneID"])
+        if len(mane_rows) == 0:
+            return []
+        mane_rows = mane_rows.with_columns(
+            pl.col("#NCBI_GeneID")
+            .str.split_exact(":", 1)
+            .struct.field("field_1")
+            .cast(pl.Int32)
+            .alias("ncbi_gene_id"),
+            pl.col("HGNC_ID")
+            .str.split_exact(":", 1)
+            .struct.field("field_1")
+            .cast(pl.Int32)
+            .alias("hgnc_id"),
+        )
+        mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
+        return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]

cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl