PyPI - cool-seq-tool - Version diff - 0.5.1.tar.gz → 0.6.0.tar.gz - Mend

cool-seq-tool 0.5.1.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cool_seq_tool
-Version: 0.5.1
+Version: 0.6.0
 Summary: Common Operation on Lots of Sequences Tool
 Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
 License: MIT License

{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/mane_transcript.py RENAMED Viewed

@@ -25,6 +25,7 @@ from cool_seq_tool.mappers.liftover import LiftOver
 from cool_seq_tool.schemas import (
     AnnotationLayer,
     Assembly,
+    ManeGeneData,
     ResidueMode,
     Strand,
     TranscriptPriority,
@@ -71,10 +72,10 @@ class CdnaRepresentation(DataRepresentation):
 class GenomicRepresentation(BaseModel):
     """Define object model for genomic representation"""
-    refseq: str
     pos: tuple[int, int]
-    status: TranscriptPriority
-    alt_ac: str
+    mane_genes: list[ManeGeneData] = []
+    status: Literal["grch38"] = TranscriptPriority.GRCH38.value
+    ac: str
 class ProteinAndCdnaRepresentation(BaseModel):
@@ -108,7 +109,7 @@ class ManeTranscript:
         >>> import asyncio
         >>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
-        >>> result["ac"]
+        >>> result.ac
         'NC_000001.11'
         See the :ref:`Usage section <async_note>` for more information.
@@ -128,7 +129,7 @@ class ManeTranscript:
         self.liftover = liftover
     @staticmethod
-    def _get_reading_frame(pos: int) -> int:
+    def get_reading_frame(pos: int) -> int:
         """Return reading frame number. Only used on c. coordinate.
         :param pos: cDNA position
@@ -531,8 +532,8 @@ class ManeTranscript:
         """
         for pos, pos_index in [(start_pos, 0), (end_pos, 1)]:
             if pos is not None:
-                og_rf = self._get_reading_frame(pos)
-                new_rf = self._get_reading_frame(transcript_data.pos[pos_index])
+                og_rf = self.get_reading_frame(pos)
+                new_rf = self.get_reading_frame(transcript_data.pos[pos_index])
                 if og_rf != new_rf:
                     _logger.warning(
@@ -618,7 +619,7 @@ class ManeTranscript:
         return True
-    def _validate_index(
+    def validate_index(
         self, ac: str, pos: tuple[int, int], coding_start_site: int
     ) -> bool:
         """Validate that positions actually exist on accession
@@ -910,7 +911,7 @@ class ManeTranscript:
                 ac = lcr_result.refseq or lcr_result.ensembl
                 pos = lcr_result.pos
-                if not self._validate_index(ac, pos, coding_start_site):
+                if not self.validate_index(ac, pos, coding_start_site):
                     _logger.warning(
                         "%s are not valid positions on %s with coding start site %s",
                         pos,
@@ -936,7 +937,7 @@ class ManeTranscript:
                 cds = lcr_result_dict[k].get("coding_start_site", 0)
                 ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
                 pos = lcr_result_dict[k]["pos"]
-                if not self._validate_index(ac, pos, cds):
+                if not self.validate_index(ac, pos, cds):
                     valid = False
                     _logger.warning(
                         "%s are not valid positions on %s with coding start site %s",
@@ -962,7 +963,16 @@ class ManeTranscript:
         residue_mode: Literal[ResidueMode.RESIDUE]
         | Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.RESIDUE,
     ) -> DataRepresentation | CdnaRepresentation | None:
-        """Return MANE transcript.
+        """Return MANE representation
+        If ``start_annotation_layer`` is ``AnnotationLayer.PROTEIN``, will return
+            ``AnnotationLayer.PROTEIN`` representation.
+        If ``start_annotation_layer`` is ``AnnotationLayer.CDNA``, will return
+            ``AnnotationLayer.CDNA`` representation.
+        If ``start_annotation_layer`` is ``AnnotationLayer.GENOMIC`` will return
+            ``AnnotationLayer.CDNA`` representation if ``gene`` is provided and
+            ``AnnotationLayer.GENOMIC`` GRCh38 representation if ``gene`` is NOT
+            provided.
         >>> from cool_seq_tool.app import CoolSeqTool
         >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
@@ -983,7 +993,11 @@ class ManeTranscript:
         :param start_pos: Start position change
         :param end_pos: End position change
         :param start_annotation_layer: Starting annotation layer.
-        :param gene: HGNC gene symbol
+        :param gene: HGNC gene symbol.
+            If ``gene`` is not provided and ``start_annotation_layer`` is
+            ``AnnotationLayer.GENOMIC``, will return GRCh38 representation.
+            If ``gene`` is provided and ``start_annotation_layer`` is
+            ``AnnotationLayer.GENOMIC``, will return cDNA representation.
         :param ref: Reference at position given during input
         :param try_longest_compatible: ``True`` if should try longest compatible remaining
             if mane transcript was not compatible. ``False`` otherwise.
@@ -1093,29 +1107,56 @@ class ManeTranscript:
                 )
             return None
         if start_annotation_layer == AnnotationLayer.GENOMIC:
+            if not gene:
+                return await self.g_to_grch38(
+                    ac,
+                    start_pos,
+                    end_pos,
+                    get_mane_genes=True,
+                    residue_mode=residue_mode,
+                )
             return await self.g_to_mane_c(
-                ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode
+                ac, start_pos, end_pos, gene, residue_mode=residue_mode
             )
         _logger.warning("Annotation layer not supported: %s", start_annotation_layer)
         return None
-    async def g_to_grch38(self, ac: str, start_pos: int, end_pos: int) -> dict | None:
+    async def g_to_grch38(
+        self,
+        ac: str,
+        start_pos: int,
+        end_pos: int,
+        get_mane_genes: bool = False,
+        residue_mode: ResidueMode = ResidueMode.RESIDUE,
+    ) -> GenomicRepresentation | None:
         """Return genomic coordinate on GRCh38 when not given gene context.
         :param ac: Genomic accession
         :param start_pos: Genomic start position
         :param end_pos: Genomic end position
-        :return: NC accession, start and end pos on GRCh38 assembly
+        :param get_mane_genes: ``True`` if mane genes for genomic position should be
+            included in response. ``False``, otherwise.
+        :param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
+        :return: GRCh38 genomic representation (accession and start/end inter-residue
+            position)
         """
-        if end_pos is None:
-            end_pos = start_pos
+        start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
         # Checking to see what chromosome and assembly we're on
         descr = await self.uta_db.get_chr_assembly(ac)
         if not descr:
             # Already GRCh38 assembly
-            if self._validate_index(ac, (start_pos, end_pos), 0):
-                return {"ac": ac, "pos": (start_pos, end_pos)}
+            if self.validate_index(ac, (start_pos, end_pos), 0):
+                return GenomicRepresentation(
+                    ac=ac,
+                    pos=(start_pos, end_pos),
+                    mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
+                        ac, start_pos + 1, end_pos
+                    )
+                    if get_mane_genes
+                    else [],
+                )
             return None
         chromosome, assembly = descr
         is_same_pos = start_pos == end_pos
@@ -1145,8 +1186,16 @@ class ManeTranscript:
         newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
         if newest_ac:
             ac = newest_ac[0]
-            if self._validate_index(ac, (start_pos, end_pos), 0):
-                return {"ac": ac, "pos": (start_pos, end_pos)}
+            if self.validate_index(ac, (start_pos, end_pos), 0):
+                return GenomicRepresentation(
+                    ac=ac,
+                    pos=(start_pos, end_pos),
+                    mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
+                        ac, start_pos + 1, end_pos
+                    )
+                    if get_mane_genes
+                    else [],
+                )
         return None
     @staticmethod
@@ -1176,14 +1225,11 @@ class ManeTranscript:
         ac: str,
         start_pos: int,
         end_pos: int,
-        gene: str | None = None,
+        gene: str,
         residue_mode: ResidueMode = ResidueMode.RESIDUE,
-    ) -> GenomicRepresentation | CdnaRepresentation | None:
+    ) -> CdnaRepresentation | None:
         """Return MANE Transcript on the c. coordinate.
-        If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
-        representation.
         >>> import asyncio
         >>> from cool_seq_tool.app import CoolSeqTool
         >>> cst = CoolSeqTool()
@@ -1198,34 +1244,17 @@ class ManeTranscript:
         <TranscriptPriority.MANE_SELECT: 'mane_select'>
         >>> del cst
-        Locating a MANE transcript requires a ``gene`` symbol argument -- if none is
-        given, this method will only lift over to genomic coordinates on GRCh38.
         :param ac: Transcript accession on g. coordinate
         :param start_pos: genomic start position
         :param end_pos: genomic end position
         :param gene: HGNC gene symbol
         :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``.
             Will always return coordinates in inter-residue.
-        :return: MANE Transcripts with cDNA change on c. coordinate if gene
-            is provided. Else, GRCh38 data
+        :return: MANE Transcripts with cDNA change on c. coordinate
         """
         start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
         residue_mode = ResidueMode.INTER_RESIDUE
-        # If gene not provided, return GRCh38
-        if not gene:
-            grch38 = await self.g_to_grch38(ac, start_pos, end_pos)
-            if not grch38:
-                return None
-            return GenomicRepresentation(
-                refseq=grch38["ac"],
-                pos=grch38["pos"],
-                status=TranscriptPriority.GRCH38,
-                alt_ac=grch38["ac"],
-            )
         if not await self.uta_db.validate_genomic_ac(ac):
             _logger.warning("Genomic accession does not exist: %s", ac)
             return None
@@ -1238,12 +1267,14 @@ class ManeTranscript:
             mane_c_ac = current_mane_data["RefSeq_nuc"]
             # Liftover to GRCh38
-            grch38 = await self.g_to_grch38(ac, start_pos, end_pos)
+            grch38 = await self.g_to_grch38(
+                ac, start_pos, end_pos, get_mane_genes=False, residue_mode=residue_mode
+            )
             mane_tx_genomic_data = None
             if grch38:
                 # GRCh38 -> MANE C
                 mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
-                    mane_c_ac, grch38["ac"], grch38["pos"][0], grch38["pos"][1]
+                    mane_c_ac, grch38.ac, grch38.pos[0], grch38.pos[1]
                 )
             if not grch38 or not mane_tx_genomic_data:
@@ -1261,9 +1292,7 @@ class ManeTranscript:
                 mane_tx_genomic_data, coding_start_site
             )
-            if not self._validate_index(
-                mane_c_ac, mane_c_pos_change, coding_start_site
-            ):
+            if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
                 _logger.warning(
                     "%s are not valid positions on %s with coding start site %s",
                     mane_c_pos_change,
@@ -1284,7 +1313,7 @@ class ManeTranscript:
                 ),
                 refseq_c_ac=current_mane_data["RefSeq_nuc"],
                 ensembl_c_ac=current_mane_data["Ensembl_nuc"],
-                alt_ac=grch38["ac"] if grch38 else None,
+                alt_ac=grch38.ac if grch38 else None,
             )
         return None
@@ -1351,9 +1380,7 @@ class ManeTranscript:
             )
             # Validate MANE C positions
-            if not self._validate_index(
-                mane_c_ac, mane_c_pos_change, coding_start_site
-            ):
+            if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
                 _logger.warning(
                     "%s are not valid positions on %s with coding start site %s",
                     mane_c_pos_change,

cool_seq_tool-0.6.0/src/cool_seq_tool/schemas.py ADDED Viewed

@@ -0,0 +1,296 @@
+"""Defines attribute constants, useful object structures, and API response schemas."""
+import datetime
+from enum import Enum, IntEnum
+from typing import Literal
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    StrictInt,
+    StrictStr,
+    model_validator,
+)
+from cool_seq_tool import __version__
+_now = str(datetime.datetime.now(tz=datetime.timezone.utc))
+class AnnotationLayer(str, Enum):
+    """Create enum for supported annotation layers"""
+    PROTEIN: Literal["p"] = "p"
+    CDNA: Literal["c"] = "c"
+    GENOMIC: Literal["g"] = "g"
+class Strand(IntEnum):
+    """Create enum for positive and negative strand"""
+    POSITIVE = 1
+    NEGATIVE = -1
+class Assembly(str, Enum):
+    """Define supported genomic assemblies. Must be defined in ascending order"""
+    GRCH37 = "GRCh37"
+    GRCH38 = "GRCh38"
+    @classmethod
+    def values(cls) -> list[str]:
+        """Return list of values in enum (ascending assembly order)"""
+        return [item.value for item in cls]
+class TranscriptPriority(str, Enum):
+    """Create Enum for Transcript Priority labels"""
+    MANE_SELECT = "mane_select"
+    MANE_PLUS_CLINICAL = "mane_plus_clinical"
+    LONGEST_COMPATIBLE_REMAINING = "longest_compatible_remaining"
+    GRCH38 = "grch38"
+class ResidueMode(str, Enum):
+    """Create Enum for residue modes.
+    We typically prefer to operate in inter-residue coordinates, but users should be
+    careful to define the coordinate mode of their data when calling ``cool-seq-tool``
+    functions.
+                      |   | C |   | T |   | G |   |
+    ZERO              |   | 0 |   | 1 |   | 2 |   |
+    RESIDUE           |   | 1 |   | 2 |   | 3 |   |
+    INTER_RESIDUE     | 0 |   | 1 |   | 2 |   | 3 |
+    .. tabularcolumns:: |L|C|C|C|C|C|C|C|
+    .. list-table::
+       :header-rows: 1
+       * -
+         -
+         - C
+         -
+         - T
+         -
+         - G
+         -
+       * - ``ZERO``
+         -
+         - 0
+         -
+         - 1
+         -
+         - 2
+         -
+       * - ``RESIDUE``
+         -
+         - 1
+         -
+         - 2
+         -
+         - 3
+         -
+       * - ``INTER_RESIDUE``
+         - 0
+         -
+         - 1
+         -
+         - 2
+         -
+         - 3
+    See "Conventions that promote reliable data sharing" and figure 3 within the
+    `Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
+    """
+    ZERO = "zero"
+    RESIDUE = "residue"
+    INTER_RESIDUE = "inter-residue"
+class BaseModelForbidExtra(BaseModel, extra="forbid"):
+    """Base Pydantic model class with extra values forbidden."""
+class ManeGeneData(BaseModel, extra="forbid"):
+    """Define minimal object model for representing a MANE gene"""
+    ncbi_gene_id: StrictInt
+    hgnc_id: StrictInt | None
+    symbol: StrictStr
+class TranscriptExonData(BaseModelForbidExtra):
+    """Model containing transcript exon data."""
+    transcript: StrictStr
+    pos: StrictInt
+    exon: StrictInt
+    exon_offset: StrictInt = 0
+    gene: StrictStr
+    chr: StrictStr
+    strand: Strand
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "chr": "NC_000001.11",
+                "gene": "TPM3",
+                "pos": 154192135,
+                "exon": 1,
+                "exon_offset": 0,
+                "transcript": "NM_152263.3",
+                "strand": Strand.NEGATIVE,
+            }
+        }
+    )
+class GenomicData(BaseModelForbidExtra):
+    """Model containing genomic and transcript exon data."""
+    gene: StrictStr
+    chr: StrictStr
+    start: StrictInt | None = None  # Genomic start position
+    end: StrictInt | None = None  # Genomic end position
+    exon_start: StrictInt | None = None
+    exon_start_offset: StrictInt | None = 0
+    exon_end: StrictInt | None = None
+    exon_end_offset: StrictInt | None = 0
+    transcript: StrictStr
+    strand: Strand
+    @model_validator(mode="after")
+    def check_start_end(cls, values):
+        """Check that at least one of {``start``, ``end``} is set.
+        Check that at least one of {``exon_start``, ``exon_end``} is set.
+        If not set, set corresponding offset to ``None``
+        """
+        start = values.start
+        end = values.end
+        if not start and not end:
+            msg = "Missing values for `start` or `end`"
+            raise ValueError(msg)
+        if start:
+            if not values.exon_start:
+                msg = "Missing value `exon_start`"
+                raise ValueError(msg)
+        else:
+            values.exon_start_offset = None
+        if end:
+            if not values.exon_end:
+                msg = "Missing value `exon_end`"
+                raise ValueError(msg)
+        else:
+            values.exon_end_offset = None
+        return values
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "gene": "TPM3",
+                "chr": "NC_000001.11",
+                "start": 154192135,
+                "end": None,
+                "exon_start": 1,
+                "exon_end": None,
+                "exon_start_offset": 0,
+                "exon_end_offset": None,
+                "transcript": "NM_152263.3",
+                "strand": Strand.NEGATIVE,
+            }
+        }
+    )
+class ServiceMeta(BaseModelForbidExtra):
+    """Metadata for cool_seq_tool service"""
+    name: Literal["cool_seq_tool"] = "cool_seq_tool"
+    version: StrictStr
+    response_datetime: datetime.datetime
+    url: Literal["https://github.com/GenomicMedLab/cool-seq-tool"] = (
+        "https://github.com/GenomicMedLab/cool-seq-tool"
+    )
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "name": "cool_seq_tool",
+                "version": __version__,
+                "response_datetime": _now,
+                "url": "https://github.com/GenomicMedLab/cool-seq-tool",
+            }
+        }
+    )
+class TranscriptExonDataResponse(BaseModelForbidExtra):
+    """Response model for Transcript Exon Data"""
+    transcript_exon_data: TranscriptExonData | None = None
+    warnings: list[StrictStr] = []
+    service_meta: ServiceMeta
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "transcript_exon_data": {
+                    "chr": "NC_000001.11",
+                    "gene": "TPM3",
+                    "pos": 154192135,
+                    "exon": 1,
+                    "exon_offset": 0,
+                    "transcript": "NM_152263.3",
+                    "strand": Strand.NEGATIVE,
+                },
+                "warnings": [],
+                "service_meta": {
+                    "name": "cool_seq_tool",
+                    "version": __version__,
+                    "response_datetime": _now,
+                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
+                },
+            }
+        }
+    )
+class GenomicDataResponse(BaseModelForbidExtra):
+    """Response model for Genomic Data"""
+    genomic_data: GenomicData | None = None
+    warnings: list[StrictStr] = []
+    service_meta: ServiceMeta
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "genomic_data": {
+                    "gene": "TPM3",
+                    "chr": "NC_000001.11",
+                    "start": 154192135,
+                    "end": None,
+                    "exon_start": 1,
+                    "exon_end": None,
+                    "exon_start_offset": 0,
+                    "exon_end_offset": None,
+                    "transcript": "NM_152263.3",
+                    "strand": Strand.NEGATIVE,
+                },
+                "warnings": [],
+                "service_meta": {
+                    "name": "cool_seq_tool",
+                    "version": __version__,
+                    "response_datetime": _now,
+                    "url": "https://github.com/GenomicMedLab/cool-seq-tool",
+                },
+            }
+        }
+    )

{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/mane_transcript_mappings.py RENAMED Viewed

@@ -8,6 +8,7 @@ from pathlib import Path
 import polars as pl
 from cool_seq_tool.resources.data_files import DataFile, get_data_file
+from cool_seq_tool.schemas import ManeGeneData
 _logger = logging.getLogger(__name__)
@@ -103,3 +104,37 @@ class ManeTranscriptMappings:
         mane_rows = mane_rows.sort(by="MANE_status", descending=True)
         return mane_rows.to_dicts()
+    def get_genomic_mane_genes(
+        self, ac: str, start: int, end: int
+    ) -> list[ManeGeneData]:
+        """Get MANE gene(s) for genomic location
+        :param ac: RefSeq genomic accession
+        :param start: Genomic start position. Assumes residue coordinates.
+        :param end: Genomic end position. Assumes residue coordinates.
+        :return: Unique MANE gene(s) found for a genomic location
+        """
+        mane_rows = self.df.filter(
+            (start >= pl.col("chr_start"))
+            & (end <= pl.col("chr_end"))
+            & (pl.col("GRCh38_chr") == ac)
+        ).unique(subset=["#NCBI_GeneID"])
+        if len(mane_rows) == 0:
+            return []
+        mane_rows = mane_rows.with_columns(
+            pl.col("#NCBI_GeneID")
+            .str.split_exact(":", 1)
+            .struct.field("field_1")
+            .cast(pl.Int32)
+            .alias("ncbi_gene_id"),
+            pl.col("HGNC_ID")
+            .str.split_exact(":", 1)
+            .struct.field("field_1")
+            .cast(pl.Int32)
+            .alias("hgnc_id"),
+        )
+        mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
+        return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]

{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cool_seq_tool
-Version: 0.5.1
+Version: 0.6.0
 Summary: Common Operation on Lots of Sequences Tool
 Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
 License: MIT License

{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/conftest.py RENAMED Viewed

@@ -5,7 +5,7 @@ import asyncio
 import pytest
 from cool_seq_tool.app import CoolSeqTool
-from cool_seq_tool.schemas import Strand
+from cool_seq_tool.schemas import ManeGeneData, Strand
 @pytest.fixture(scope="session")
@@ -121,3 +121,15 @@ def genomic_tx_data():
         "tx_ac": "NM_004333.4",
         "alt_ac": "NC_000007.13",
     }
+@pytest.fixture(scope="session")
+def egfr_mane_gene():
+    """Create test fixture for EGFR MANE gene"""
+    return ManeGeneData(ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR")
+@pytest.fixture(scope="session")
+def braf_mane_gene():
+    """Create test fixture for BRAF MANE gene"""
+    return ManeGeneData(ncbi_gene_id=673, hgnc_id=1097, symbol="BRAF")

cool-seq-tool 0.5.1.tar.gz → 0.6.0.tar.gz

cool-seq-tool 0.5.1.tar.gz → 0.6.0.tar.gz