cool-seq-tool 0.14.1__tar.gz → 0.14.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.gitignore +3 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/PKG-INFO +5 -7
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/install.rst +1 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/reference/index.rst +1 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/pyproject.toml +7 -6
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/mappers/__init__.py +8 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/mappers/alignment.py +1 -1
- cool_seq_tool-0.14.3/src/cool_seq_tool/mappers/feature_overlap.py +252 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/mappers/mane_transcript.py +17 -10
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/resources/data_files.py +13 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/resources/status.py +6 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/schemas.py +36 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/utils.py +1 -1
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool.egg-info/PKG-INFO +5 -7
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool.egg-info/SOURCES.txt +2 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool.egg-info/requires.txt +2 -4
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/conftest.py +0 -10
- cool_seq_tool-0.14.3/tests/handlers/test_feature_overlap.py +492 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/mappers/test_mane_transcript.py +5 -5
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.coveragerc +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/ISSUE_TEMPLATE/bug-report.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/ISSUE_TEMPLATE/feature-request.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/workflows/checks.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/workflows/pr-priority-label.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/workflows/release.yml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.github/workflows/stale.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.pre-commit-config.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/.readthedocs.yaml +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/CITATION.cff +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/LICENSE +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/README.md +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/Makefile +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/make.bat +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/_static/img/biomart.png +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/_templates/module_summary.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/changelog.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/conf.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/contributing.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/index.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/license.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/transcript_selection.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/docs/source/usage.rst +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/setup.cfg +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/__init__.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/app.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/handlers/__init__.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/handlers/seqrepo_access.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/mappers/exon_genomic_coords.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/mappers/liftover.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/resources/__init__.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/resources/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/sources/__init__.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/sources/mane_transcript_mappings.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/sources/transcript_mappings.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool/sources/uta_database.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool.egg-info/dependency_links.txt +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/src/cool_seq_tool.egg-info/top_level.txt +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/handlers/test_seqrepo_access.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/mappers/test_alignment.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/mappers/test_exon_genomic_coords.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/mappers/test_liftover.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/sources/test_mane_transcript_mappings.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/sources/test_uta_database.py +0 -0
- {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.3}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cool_seq_tool
|
3
|
-
Version: 0.14.1
|
3
|
+
Version: 0.14.3
|
4
4
|
Summary: Common Operation on Lots of Sequences Tool
|
5
5
|
Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
|
6
6
|
License: MIT License
|
@@ -38,22 +38,20 @@ Classifier: Intended Audience :: Developers
|
|
38
38
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
39
39
|
Classifier: License :: OSI Approved :: MIT License
|
40
40
|
Classifier: Programming Language :: Python :: 3
|
41
|
-
Classifier: Programming Language :: Python :: 3.10
|
42
41
|
Classifier: Programming Language :: Python :: 3.11
|
43
42
|
Classifier: Programming Language :: Python :: 3.12
|
44
|
-
|
43
|
+
Classifier: Programming Language :: Python :: 3.13
|
44
|
+
Requires-Python: >=3.11
|
45
45
|
Description-Content-Type: text/markdown
|
46
46
|
License-File: LICENSE
|
47
47
|
Requires-Dist: asyncpg
|
48
|
-
Requires-Dist: aiofiles
|
49
48
|
Requires-Dist: boto3
|
50
49
|
Requires-Dist: agct>=0.1.0-dev1
|
51
50
|
Requires-Dist: polars~=1.0
|
52
|
-
Requires-Dist: hgvs
|
53
51
|
Requires-Dist: biocommons.seqrepo
|
54
52
|
Requires-Dist: pydantic<3.0,>=2.0
|
55
53
|
Requires-Dist: ga4gh.vrs<3.0,>=2.1.3
|
56
|
-
Requires-Dist: wags-tails~=0.
|
54
|
+
Requires-Dist: wags-tails~=0.4.0
|
57
55
|
Requires-Dist: bioutils
|
58
56
|
Provides-Extra: dev
|
59
57
|
Requires-Dist: pre-commit>=4.2.0; extra == "dev"
|
@@ -64,7 +62,7 @@ Requires-Dist: ruff==0.12.1; extra == "dev"
|
|
64
62
|
Provides-Extra: tests
|
65
63
|
Requires-Dist: pytest; extra == "tests"
|
66
64
|
Requires-Dist: pytest-cov; extra == "tests"
|
67
|
-
Requires-Dist: pytest-asyncio
|
65
|
+
Requires-Dist: pytest-asyncio; extra == "tests"
|
68
66
|
Requires-Dist: mock; extra == "tests"
|
69
67
|
Provides-Extra: docs
|
70
68
|
Requires-Dist: sphinx==6.1.3; extra == "docs"
|
@@ -16,24 +16,22 @@ classifiers = [
|
|
16
16
|
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
17
17
|
"License :: OSI Approved :: MIT License",
|
18
18
|
"Programming Language :: Python :: 3",
|
19
|
-
"Programming Language :: Python :: 3.10",
|
20
19
|
"Programming Language :: Python :: 3.11",
|
21
20
|
"Programming Language :: Python :: 3.12",
|
21
|
+
"Programming Language :: Python :: 3.13",
|
22
22
|
]
|
23
|
-
requires-python = ">=3.10"
|
23
|
+
requires-python = ">=3.11"
|
24
24
|
description = "Common Operation on Lots of Sequences Tool"
|
25
25
|
license = {file = "LICENSE"}
|
26
26
|
dependencies = [
|
27
27
|
"asyncpg",
|
28
|
-
"aiofiles",
|
29
28
|
"boto3",
|
30
29
|
"agct >= 0.1.0-dev1",
|
31
30
|
"polars ~= 1.0",
|
32
|
-
"hgvs",
|
33
31
|
"biocommons.seqrepo",
|
34
32
|
"pydantic >=2.0,<3.0",
|
35
33
|
"ga4gh.vrs >=2.1.3,<3.0",
|
36
|
-
"wags-tails ~= 0.
|
34
|
+
"wags-tails ~= 0.4.0",
|
37
35
|
"bioutils",
|
38
36
|
]
|
39
37
|
dynamic = ["version"]
|
@@ -46,7 +44,7 @@ dev = [
|
|
46
44
|
"psycopg2-binary",
|
47
45
|
"ruff==0.12.1",
|
48
46
|
]
|
49
|
-
tests = ["pytest", "pytest-cov", "pytest-asyncio
|
47
|
+
tests = ["pytest", "pytest-cov", "pytest-asyncio", "mock"]
|
50
48
|
docs = [
|
51
49
|
"sphinx==6.1.3",
|
52
50
|
"sphinx-autodoc-typehints==1.22.0",
|
@@ -81,6 +79,9 @@ build-backend = "setuptools.build_meta"
|
|
81
79
|
"cool_seq_tool.resources" = ["transcript_mapping.tsv"]
|
82
80
|
|
83
81
|
[tool.pytest.ini_options]
|
82
|
+
asyncio_mode = "auto"
|
83
|
+
asyncio_default_fixture_loop_scope = "session"
|
84
|
+
asyncio_default_test_loop_scope = "session"
|
84
85
|
addopts = "--cov=src --cov-report term-missing"
|
85
86
|
testpaths = ["tests"]
|
86
87
|
|
@@ -4,6 +4,13 @@ from .alignment import AlignmentMapper # noqa: I001
|
|
4
4
|
from .liftover import LiftOver
|
5
5
|
from .mane_transcript import ManeTranscript
|
6
6
|
from .exon_genomic_coords import ExonGenomicCoordsMapper
|
7
|
+
from .feature_overlap import FeatureOverlap
|
7
8
|
|
8
9
|
|
9
|
-
__all__ = [
|
10
|
+
__all__ = [
|
11
|
+
"AlignmentMapper",
|
12
|
+
"ExonGenomicCoordsMapper",
|
13
|
+
"FeatureOverlap",
|
14
|
+
"LiftOver",
|
15
|
+
"ManeTranscript",
|
16
|
+
]
|
@@ -106,7 +106,7 @@ class AlignmentMapper:
|
|
106
106
|
c_end_pos: int,
|
107
107
|
cds_start: int | None = None,
|
108
108
|
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
109
|
-
target_genome_assembly:
|
109
|
+
target_genome_assembly: Assembly = Assembly.GRCH38,
|
110
110
|
) -> tuple[dict | None, str | None]:
|
111
111
|
"""Translate cDNA representation to genomic representation
|
112
112
|
|
@@ -0,0 +1,252 @@
|
|
1
|
+
"""Module for getting feature (gene/exon) overlap"""
|
2
|
+
|
3
|
+
import re
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
import polars as pl
|
7
|
+
from ga4gh.core import ga4gh_identify
|
8
|
+
from ga4gh.vrs.models import SequenceLocation, SequenceReference
|
9
|
+
|
10
|
+
from cool_seq_tool.handlers import SeqRepoAccess
|
11
|
+
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
12
|
+
from cool_seq_tool.schemas import Assembly, CdsOverlap, CoordinateType
|
13
|
+
|
14
|
+
# Pattern for chromosome
|
15
|
+
CHR_PATTERN = r"X|Y|([1-9]|1[0-9]|2[0-2])"
|
16
|
+
|
17
|
+
|
18
|
+
class FeatureOverlapError(Exception):
|
19
|
+
"""Custom exception for the Feature Overlap class"""
|
20
|
+
|
21
|
+
|
22
|
+
class FeatureOverlap:
|
23
|
+
"""The class for getting feature overlap"""
|
24
|
+
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
seqrepo_access: SeqRepoAccess,
|
28
|
+
mane_refseq_genomic_path: Path | None = None,
|
29
|
+
from_local: bool = False,
|
30
|
+
) -> None:
|
31
|
+
"""Initialize the FeatureOverlap class. Will load RefSeq data and store as df.
|
32
|
+
|
33
|
+
:param seqrepo_access: Client for accessing SeqRepo data
|
34
|
+
:param mane_refseq_genomic_path: Path to MANE RefSeq Genomic GFF data
|
35
|
+
:param from_local: if ``True``, don't check for or acquire latest version --
|
36
|
+
just provide most recent locally available file, if possible, and raise
|
37
|
+
error otherwise
|
38
|
+
"""
|
39
|
+
if not mane_refseq_genomic_path:
|
40
|
+
mane_refseq_genomic_path = get_data_file(
|
41
|
+
DataFile.MANE_REFSEQ_GENOMIC, from_local
|
42
|
+
)
|
43
|
+
self.seqrepo_access = seqrepo_access
|
44
|
+
self.mane_refseq_genomic_path = mane_refseq_genomic_path
|
45
|
+
self.df = self._load_mane_refseq_gff_data()
|
46
|
+
|
47
|
+
def _load_mane_refseq_gff_data(self) -> pl.DataFrame:
|
48
|
+
"""Load MANE RefSeq GFF data file into DataFrame.
|
49
|
+
|
50
|
+
:return: DataFrame containing MANE RefSeq Genomic GFF data for CDS. Columns
|
51
|
+
include `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`,
|
52
|
+
`cds_stop`, `info_name` (name of record), and `gene`. `cds_start` and
|
53
|
+
`cds_stop` use inter-residue coordinates.
|
54
|
+
"""
|
55
|
+
df = pl.read_csv(
|
56
|
+
self.mane_refseq_genomic_path,
|
57
|
+
separator="\t",
|
58
|
+
has_header=False,
|
59
|
+
skip_rows=9,
|
60
|
+
columns=[0, 2, 3, 4, 8],
|
61
|
+
)
|
62
|
+
df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"]
|
63
|
+
|
64
|
+
# Restrict to only feature of interest: CDS (which has gene info)
|
65
|
+
df = df.filter(pl.col("type") == "CDS")
|
66
|
+
|
67
|
+
# Get name from the info field
|
68
|
+
# Get gene from the info field
|
69
|
+
# Get chromosome names without prefix and without suffix for alternate transcripts
|
70
|
+
# Convert start and stop to ints
|
71
|
+
# Convert to inter-residue coordinates
|
72
|
+
# Only return certain columns
|
73
|
+
return df.with_columns(
|
74
|
+
(pl.col("info").str.extract(r"Name=([^;]+)", 1).alias("info_name")),
|
75
|
+
(pl.col("info").str.extract(r"gene=([^;]+)", 1).alias("gene")),
|
76
|
+
(pl.col("chromosome").str.extract(r"^chr?([^_]+)", 1).alias("chromosome")),
|
77
|
+
(pl.col("cds_start").cast(pl.Int64) - 1).alias("cds_start"),
|
78
|
+
(pl.col("cds_stop").cast(pl.Int64).alias("cds_stop")),
|
79
|
+
).select(
|
80
|
+
[
|
81
|
+
pl.col("type"),
|
82
|
+
pl.col("chromosome"),
|
83
|
+
pl.col("cds_start"),
|
84
|
+
pl.col("cds_stop"),
|
85
|
+
pl.col("info_name"),
|
86
|
+
pl.col("gene"),
|
87
|
+
]
|
88
|
+
)
|
89
|
+
|
90
|
+
def _get_chr_from_alt_ac(self, identifier: str) -> str:
|
91
|
+
"""Get chromosome given genomic identifier
|
92
|
+
|
93
|
+
:param identifier: Genomic identifier on GRCh38 assembly
|
94
|
+
:raises FeatureOverlapError: If unable to find associated GRCh38 chromosome
|
95
|
+
:return: Chromosome. 1..22, X, Y. No 'chr' prefix.
|
96
|
+
"""
|
97
|
+
aliases, error_msg = self.seqrepo_access.translate_identifier(
|
98
|
+
identifier, Assembly.GRCH38.value
|
99
|
+
)
|
100
|
+
|
101
|
+
if error_msg:
|
102
|
+
raise FeatureOverlapError(str(error_msg))
|
103
|
+
|
104
|
+
if not aliases:
|
105
|
+
error_msg = (
|
106
|
+
f"Unable to find {Assembly.GRCH38.value} aliases for: {identifier}"
|
107
|
+
)
|
108
|
+
raise FeatureOverlapError(error_msg)
|
109
|
+
|
110
|
+
assembly_chr_pattern = (
|
111
|
+
rf"^{Assembly.GRCH38.value}:(?P<chromosome>{CHR_PATTERN})$"
|
112
|
+
)
|
113
|
+
for a in aliases:
|
114
|
+
chr_match = re.match(assembly_chr_pattern, a)
|
115
|
+
if chr_match:
|
116
|
+
break
|
117
|
+
|
118
|
+
if not chr_match:
|
119
|
+
error_msg = (
|
120
|
+
f"Unable to find {Assembly.GRCH38.value} chromosome for: {identifier}"
|
121
|
+
)
|
122
|
+
raise FeatureOverlapError(error_msg)
|
123
|
+
|
124
|
+
chr_groupdict = chr_match.groupdict()
|
125
|
+
return chr_groupdict["chromosome"]
|
126
|
+
|
127
|
+
def get_grch38_mane_gene_cds_overlap(
|
128
|
+
self,
|
129
|
+
start: int,
|
130
|
+
end: int,
|
131
|
+
chromosome: str | None = None,
|
132
|
+
identifier: str | None = None,
|
133
|
+
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
134
|
+
) -> dict[str, list[CdsOverlap]] | None:
|
135
|
+
"""Given GRCh38 genomic data, find the overlapping MANE features (gene and cds).
|
136
|
+
The genomic data is specified as a sequence location by `chromosome`, `start`,
|
137
|
+
`end`. All CDS regions with which the input sequence location has nonzero base
|
138
|
+
pair overlap will be returned.
|
139
|
+
|
140
|
+
:param start: GRCh38 start position
|
141
|
+
:param end: GRCh38 end position
|
142
|
+
:param chromosome: Chromosome. 1..22, X, or Y. If not provided, must provide
|
143
|
+
`identifier`. If both `chromosome` and `identifier` are provided,
|
144
|
+
`chromosome` will be used.
|
145
|
+
:param identifier: Genomic identifier on GRCh38 assembly. If not provided, must
|
146
|
+
provide `chromosome`. If both `chromosome` and `identifier` are provided,
|
147
|
+
`chromosome` will be used.
|
148
|
+
:param coordinate_type: Coordinate type for ``start`` and ``end``
|
149
|
+
:raise FeatureOverlapError: If missing required fields or unable to find
|
150
|
+
associated ga4gh identifier
|
151
|
+
:return: MANE feature (gene/cds) overlap data represented as a dict. The
|
152
|
+
dictionary will be keyed by genes which overlap the input sequence location.
|
153
|
+
Each gene contains a list of the overlapping CDS regions with the beginning
|
154
|
+
and end of the input sequence location's overlap with each
|
155
|
+
"""
|
156
|
+
ga4gh_seq_id = None
|
157
|
+
if chromosome:
|
158
|
+
if not re.match(f"^{CHR_PATTERN}$", chromosome):
|
159
|
+
error_msg = "`chromosome` must be 1, ..., 22, X, or Y"
|
160
|
+
raise FeatureOverlapError(error_msg)
|
161
|
+
else:
|
162
|
+
if identifier:
|
163
|
+
chromosome = self._get_chr_from_alt_ac(identifier)
|
164
|
+
if identifier.startswith("ga4gh:SQ."):
|
165
|
+
ga4gh_seq_id = identifier
|
166
|
+
else:
|
167
|
+
error_msg = "Must provide either `chromosome` or `identifier`"
|
168
|
+
raise FeatureOverlapError(error_msg)
|
169
|
+
|
170
|
+
# Convert residue to inter-residue
|
171
|
+
if coordinate_type == CoordinateType.RESIDUE:
|
172
|
+
start -= 1
|
173
|
+
|
174
|
+
# Get feature dataframe (df uses inter-residue)
|
175
|
+
feature_df = self.df.filter(
|
176
|
+
(pl.col("chromosome") == chromosome)
|
177
|
+
& (pl.col("cds_start") <= end)
|
178
|
+
& (pl.col("cds_stop") >= start)
|
179
|
+
)
|
180
|
+
|
181
|
+
if feature_df.is_empty():
|
182
|
+
return None
|
183
|
+
|
184
|
+
# Add overlap columns
|
185
|
+
feature_df = feature_df.with_columns(
|
186
|
+
[
|
187
|
+
pl.when(pl.col("cds_start") < start)
|
188
|
+
.then(start)
|
189
|
+
.otherwise(pl.col("cds_start"))
|
190
|
+
.alias("overlap_start"),
|
191
|
+
pl.when(pl.col("cds_stop") > end)
|
192
|
+
.then(end)
|
193
|
+
.otherwise(pl.col("cds_stop"))
|
194
|
+
.alias("overlap_stop"),
|
195
|
+
]
|
196
|
+
)
|
197
|
+
|
198
|
+
# Get ga4gh identifier for chromosome
|
199
|
+
if not ga4gh_seq_id:
|
200
|
+
grch38_chr = f"{Assembly.GRCH38.value}:{chromosome}"
|
201
|
+
ga4gh_aliases, error_msg = self.seqrepo_access.translate_identifier(
|
202
|
+
grch38_chr, "ga4gh"
|
203
|
+
)
|
204
|
+
|
205
|
+
# Errors should never happen but catching just in case
|
206
|
+
if error_msg:
|
207
|
+
raise FeatureOverlapError(str(error_msg))
|
208
|
+
|
209
|
+
if not ga4gh_aliases:
|
210
|
+
error_msg = f"Unable to find ga4gh identifier for: {grch38_chr}"
|
211
|
+
raise FeatureOverlapError(error_msg)
|
212
|
+
|
213
|
+
ga4gh_seq_id = ga4gh_aliases[0]
|
214
|
+
|
215
|
+
def _get_seq_loc(
|
216
|
+
start_pos: int, stop_pos: int, refget_ac: str
|
217
|
+
) -> SequenceLocation:
|
218
|
+
"""Get VRS Sequence Location
|
219
|
+
|
220
|
+
:param start_pos: Start position
|
221
|
+
:param stop_pos: Stop position
|
222
|
+
:param refget_ac: Refget Accession (SQ.)
|
223
|
+
:return: VRS Sequence Location
|
224
|
+
"""
|
225
|
+
_sl = SequenceLocation(
|
226
|
+
sequenceReference=SequenceReference(
|
227
|
+
refgetAccession=refget_ac,
|
228
|
+
),
|
229
|
+
start=start_pos,
|
230
|
+
end=stop_pos,
|
231
|
+
)
|
232
|
+
ga4gh_identify(_sl)
|
233
|
+
return _sl
|
234
|
+
|
235
|
+
resp = {}
|
236
|
+
refget_ac = ga4gh_seq_id.split("ga4gh:")[-1]
|
237
|
+
for gene, group in feature_df.group_by("gene"):
|
238
|
+
gene = gene[0]
|
239
|
+
_gene_overlap_data = [
|
240
|
+
CdsOverlap(
|
241
|
+
cds=_get_seq_loc(
|
242
|
+
cds_row["cds_start"], cds_row["cds_stop"], refget_ac
|
243
|
+
),
|
244
|
+
overlap=_get_seq_loc(
|
245
|
+
cds_row["overlap_start"], cds_row["overlap_stop"], refget_ac
|
246
|
+
),
|
247
|
+
).model_dump(by_alias=True, exclude_none=True)
|
248
|
+
for cds_row in group.iter_rows(named=True)
|
249
|
+
]
|
250
|
+
resp[gene] = _gene_overlap_data
|
251
|
+
|
252
|
+
return resp
|
@@ -55,7 +55,7 @@ class DataRepresentation(BaseModel):
|
|
55
55
|
"""Define object model for final output representation"""
|
56
56
|
|
57
57
|
gene: str | None = None
|
58
|
-
refseq: str
|
58
|
+
refseq: str | None = None
|
59
59
|
ensembl: str | None = None
|
60
60
|
pos: tuple[int, int]
|
61
61
|
strand: Strand
|
@@ -447,7 +447,7 @@ class ManeTranscript:
|
|
447
447
|
|
448
448
|
async def _g_to_c(
|
449
449
|
self,
|
450
|
-
g:
|
450
|
+
g: GenomicTxMetadata,
|
451
451
|
refseq_c_ac: str,
|
452
452
|
status: TranscriptPriority,
|
453
453
|
ensembl_c_ac: str | None = None,
|
@@ -590,16 +590,23 @@ class ManeTranscript:
|
|
590
590
|
if mane_transcript:
|
591
591
|
mane_start_pos = mane_transcript.pos[0]
|
592
592
|
mane_end_pos = mane_transcript.pos[1]
|
593
|
-
if anno == AnnotationLayer.CDNA:
|
593
|
+
if anno == AnnotationLayer.CDNA and isinstance(
|
594
|
+
mane_transcript, CdnaRepresentation
|
595
|
+
):
|
594
596
|
mane_cds = mane_transcript.coding_start_site
|
595
597
|
mane_start_pos += mane_cds
|
596
598
|
mane_end_pos += mane_cds
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
599
|
+
|
600
|
+
if mane_transcript.refseq:
|
601
|
+
mane_ref, _ = self.seqrepo_access.get_reference_sequence(
|
602
|
+
mane_transcript.refseq,
|
603
|
+
start=mane_start_pos,
|
604
|
+
end=mane_end_pos if mane_start_pos != mane_end_pos else None,
|
605
|
+
coordinate_type=coordinate_type,
|
606
|
+
)
|
607
|
+
else:
|
608
|
+
mane_ref = None
|
609
|
+
|
603
610
|
if not mane_ref:
|
604
611
|
_logger.info("Unable to validate reference for MANE Transcript")
|
605
612
|
|
@@ -1330,7 +1337,7 @@ class ManeTranscript:
|
|
1330
1337
|
gene: str | None = None,
|
1331
1338
|
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
1332
1339
|
try_longest_compatible: bool = False,
|
1333
|
-
) ->
|
1340
|
+
) -> ProteinAndCdnaRepresentation | None:
|
1334
1341
|
"""Given GRCh38 genomic representation, return protein representation.
|
1335
1342
|
|
1336
1343
|
Will try MANE Select and then MANE Plus Clinical. If neither is found and
|
@@ -6,7 +6,11 @@ from importlib import resources
|
|
6
6
|
from os import environ
|
7
7
|
from pathlib import Path
|
8
8
|
|
9
|
-
from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData
|
9
|
+
from wags_tails import (
|
10
|
+
NcbiLrgRefSeqGeneData,
|
11
|
+
NcbiManeRefSeqGenomicData,
|
12
|
+
NcbiManeSummaryData,
|
13
|
+
)
|
10
14
|
|
11
15
|
_logger = logging.getLogger(__name__)
|
12
16
|
|
@@ -16,6 +20,7 @@ class DataFile(str, Enum):
|
|
16
20
|
|
17
21
|
TRANSCRIPT_MAPPINGS = "transcript_mappings"
|
18
22
|
MANE_SUMMARY = "mane_summary"
|
23
|
+
MANE_REFSEQ_GENOMIC = "mane_refseq_genomic"
|
19
24
|
LRG_REFSEQGENE = "lrg_refseqgene"
|
20
25
|
|
21
26
|
def lower(self) -> str:
|
@@ -37,6 +42,12 @@ _resource_acquisition_params = {
|
|
37
42
|
from_local=from_local
|
38
43
|
)[0],
|
39
44
|
),
|
45
|
+
DataFile.MANE_REFSEQ_GENOMIC: (
|
46
|
+
"MANE_REFSEQ_GENOMIC_PATH",
|
47
|
+
lambda from_local: NcbiManeRefSeqGenomicData(silent=True).get_latest(
|
48
|
+
from_local=from_local
|
49
|
+
)[0],
|
50
|
+
),
|
40
51
|
DataFile.LRG_REFSEQGENE: (
|
41
52
|
"LRG_REFSEQGENE_PATH",
|
42
53
|
lambda from_local: NcbiLrgRefSeqGeneData(silent=True).get_latest(
|
@@ -53,6 +64,7 @@ def get_data_file(resource: DataFile, from_local: bool = False) -> Path:
|
|
53
64
|
|
54
65
|
* ``Resource.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH``
|
55
66
|
* ``Resource.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH``
|
67
|
+
* ``Resource.MANE_REFSEQ_GENOMIC`` -> ``MANE_REFSEQ_GENOMIC_PATH``
|
56
68
|
* ``Resource.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH``
|
57
69
|
|
58
70
|
Otherwise, this function falls back on default expected locations:
|
@@ -24,6 +24,7 @@ ResourceStatus = namedtuple(
|
|
24
24
|
DataFile.TRANSCRIPT_MAPPINGS.lower(),
|
25
25
|
DataFile.MANE_SUMMARY.lower(),
|
26
26
|
DataFile.LRG_REFSEQGENE.lower(),
|
27
|
+
DataFile.MANE_REFSEQ_GENOMIC.lower(),
|
27
28
|
"liftover",
|
28
29
|
),
|
29
30
|
)
|
@@ -37,6 +38,7 @@ async def check_status(
|
|
37
38
|
sr: SeqRepo | None = None,
|
38
39
|
chain_file_37_to_38: str | None = None,
|
39
40
|
chain_file_38_to_37: str | None = None,
|
41
|
+
mane_refseq_genomic_path: str | None = None,
|
40
42
|
) -> ResourceStatus:
|
41
43
|
"""Perform basic status checks on availability of required data resources.
|
42
44
|
|
@@ -62,6 +64,7 @@ async def check_status(
|
|
62
64
|
is used for ``agct``. If this is not provided, will check to see if
|
63
65
|
``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will allow
|
64
66
|
``agct`` to download a chain file from UCSC
|
67
|
+
:param mane_refseq_genomic_path: Optional path to MANE RefSeq Genomic GFF data
|
65
68
|
:return: boolean description of availability of each resource, given current
|
66
69
|
environment configurations
|
67
70
|
"""
|
@@ -69,19 +72,21 @@ async def check_status(
|
|
69
72
|
DataFile.TRANSCRIPT_MAPPINGS.lower(): transcript_file_path,
|
70
73
|
DataFile.LRG_REFSEQGENE.lower(): lrg_refseqgene_path,
|
71
74
|
DataFile.MANE_SUMMARY.lower(): mane_data_path,
|
75
|
+
DataFile.MANE_REFSEQ_GENOMIC.lower(): mane_refseq_genomic_path,
|
72
76
|
}
|
73
77
|
|
74
78
|
status = {
|
75
79
|
DataFile.TRANSCRIPT_MAPPINGS.lower(): False,
|
76
80
|
DataFile.LRG_REFSEQGENE.lower(): False,
|
77
81
|
DataFile.MANE_SUMMARY.lower(): False,
|
82
|
+
DataFile.MANE_REFSEQ_GENOMIC.lower(): False,
|
78
83
|
"liftover": False,
|
79
84
|
"uta": False,
|
80
85
|
"seqrepo": False,
|
81
86
|
}
|
82
87
|
for r in list(DataFile):
|
83
88
|
name_lower = r.lower()
|
84
|
-
declared_path = file_path_params
|
89
|
+
declared_path = file_path_params.get(name_lower)
|
85
90
|
if declared_path and declared_path.exists() and declared_path.is_file():
|
86
91
|
status[name_lower] = True
|
87
92
|
continue
|
@@ -4,6 +4,7 @@ import datetime
|
|
4
4
|
from enum import Enum, IntEnum
|
5
5
|
from typing import Literal
|
6
6
|
|
7
|
+
from ga4gh.vrs.models import SequenceLocation
|
7
8
|
from pydantic import (
|
8
9
|
BaseModel,
|
9
10
|
ConfigDict,
|
@@ -13,7 +14,7 @@ from pydantic import (
|
|
13
14
|
|
14
15
|
from cool_seq_tool import __version__
|
15
16
|
|
16
|
-
_now = str(datetime.datetime.now(tz=datetime.timezone.utc))
|
17
|
+
_now = str(datetime.datetime.now(tz=datetime.UTC))
|
17
18
|
|
18
19
|
|
19
20
|
class AnnotationLayer(str, Enum):
|
@@ -167,3 +168,37 @@ class ServiceMeta(BaseModelForbidExtra):
|
|
167
168
|
}
|
168
169
|
}
|
169
170
|
)
|
171
|
+
|
172
|
+
|
173
|
+
class CdsOverlap(BaseModelForbidExtra):
|
174
|
+
"""Create model for representing CDS start/stop and Overlap start/stop"""
|
175
|
+
|
176
|
+
cds: SequenceLocation
|
177
|
+
overlap: SequenceLocation
|
178
|
+
|
179
|
+
model_config = ConfigDict(
|
180
|
+
json_schema_extra={
|
181
|
+
"example": {
|
182
|
+
"cds": {
|
183
|
+
"id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
|
184
|
+
"type": "SequenceLocation",
|
185
|
+
"sequenceReference": {
|
186
|
+
"refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
|
187
|
+
"type": "SequenceReference",
|
188
|
+
},
|
189
|
+
"start": 140726493,
|
190
|
+
"end": 140726516,
|
191
|
+
},
|
192
|
+
"overlap": {
|
193
|
+
"id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
|
194
|
+
"type": "SequenceLocation",
|
195
|
+
"sequenceReference": {
|
196
|
+
"refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
|
197
|
+
"type": "SequenceReference",
|
198
|
+
},
|
199
|
+
"start": 140726493,
|
200
|
+
"end": 140726516,
|
201
|
+
},
|
202
|
+
}
|
203
|
+
}
|
204
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cool_seq_tool
|
3
|
-
Version: 0.14.1
|
3
|
+
Version: 0.14.3
|
4
4
|
Summary: Common Operation on Lots of Sequences Tool
|
5
5
|
Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
|
6
6
|
License: MIT License
|
@@ -38,22 +38,20 @@ Classifier: Intended Audience :: Developers
|
|
38
38
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
39
39
|
Classifier: License :: OSI Approved :: MIT License
|
40
40
|
Classifier: Programming Language :: Python :: 3
|
41
|
-
Classifier: Programming Language :: Python :: 3.10
|
42
41
|
Classifier: Programming Language :: Python :: 3.11
|
43
42
|
Classifier: Programming Language :: Python :: 3.12
|
44
|
-
|
43
|
+
Classifier: Programming Language :: Python :: 3.13
|
44
|
+
Requires-Python: >=3.11
|
45
45
|
Description-Content-Type: text/markdown
|
46
46
|
License-File: LICENSE
|
47
47
|
Requires-Dist: asyncpg
|
48
|
-
Requires-Dist: aiofiles
|
49
48
|
Requires-Dist: boto3
|
50
49
|
Requires-Dist: agct>=0.1.0-dev1
|
51
50
|
Requires-Dist: polars~=1.0
|
52
|
-
Requires-Dist: hgvs
|
53
51
|
Requires-Dist: biocommons.seqrepo
|
54
52
|
Requires-Dist: pydantic<3.0,>=2.0
|
55
53
|
Requires-Dist: ga4gh.vrs<3.0,>=2.1.3
|
56
|
-
Requires-Dist: wags-tails~=0.
|
54
|
+
Requires-Dist: wags-tails~=0.4.0
|
57
55
|
Requires-Dist: bioutils
|
58
56
|
Provides-Extra: dev
|
59
57
|
Requires-Dist: pre-commit>=4.2.0; extra == "dev"
|
@@ -64,7 +62,7 @@ Requires-Dist: ruff==0.12.1; extra == "dev"
|
|
64
62
|
Provides-Extra: tests
|
65
63
|
Requires-Dist: pytest; extra == "tests"
|
66
64
|
Requires-Dist: pytest-cov; extra == "tests"
|
67
|
-
Requires-Dist: pytest-asyncio
|
65
|
+
Requires-Dist: pytest-asyncio; extra == "tests"
|
68
66
|
Requires-Dist: mock; extra == "tests"
|
69
67
|
Provides-Extra: docs
|
70
68
|
Requires-Dist: sphinx==6.1.3; extra == "docs"
|
@@ -39,6 +39,7 @@ src/cool_seq_tool/handlers/seqrepo_access.py
|
|
39
39
|
src/cool_seq_tool/mappers/__init__.py
|
40
40
|
src/cool_seq_tool/mappers/alignment.py
|
41
41
|
src/cool_seq_tool/mappers/exon_genomic_coords.py
|
42
|
+
src/cool_seq_tool/mappers/feature_overlap.py
|
42
43
|
src/cool_seq_tool/mappers/liftover.py
|
43
44
|
src/cool_seq_tool/mappers/mane_transcript.py
|
44
45
|
src/cool_seq_tool/resources/__init__.py
|
@@ -51,6 +52,7 @@ src/cool_seq_tool/sources/transcript_mappings.py
|
|
51
52
|
src/cool_seq_tool/sources/uta_database.py
|
52
53
|
tests/conftest.py
|
53
54
|
tests/test_utils.py
|
55
|
+
tests/handlers/test_feature_overlap.py
|
54
56
|
tests/handlers/test_seqrepo_access.py
|
55
57
|
tests/mappers/test_alignment.py
|
56
58
|
tests/mappers/test_exon_genomic_coords.py
|