cool-seq-tool 0.14.1__tar.gz → 0.14.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/PKG-INFO +3 -5
  2. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/reference/index.rst +1 -0
  3. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/pyproject.toml +5 -4
  4. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/mappers/__init__.py +8 -1
  5. cool_seq_tool-0.14.2/src/cool_seq_tool/mappers/feature_overlap.py +251 -0
  6. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/resources/data_files.py +13 -1
  7. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/schemas.py +35 -0
  8. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool.egg-info/PKG-INFO +3 -5
  9. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool.egg-info/SOURCES.txt +2 -0
  10. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool.egg-info/requires.txt +2 -4
  11. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/conftest.py +0 -10
  12. cool_seq_tool-0.14.2/tests/handlers/test_feature_overlap.py +492 -0
  13. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/mappers/test_mane_transcript.py +5 -5
  14. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.coveragerc +0 -0
  15. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/ISSUE_TEMPLATE/bug-report.yaml +0 -0
  16. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/ISSUE_TEMPLATE/feature-request.yaml +0 -0
  17. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/workflows/checks.yaml +0 -0
  18. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/workflows/pr-priority-label.yaml +0 -0
  19. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/workflows/release.yml +0 -0
  20. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.github/workflows/stale.yaml +0 -0
  21. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.gitignore +0 -0
  22. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.pre-commit-config.yaml +0 -0
  23. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/.readthedocs.yaml +0 -0
  24. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/CITATION.cff +0 -0
  25. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/LICENSE +0 -0
  26. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/README.md +0 -0
  27. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/Makefile +0 -0
  28. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/make.bat +0 -0
  29. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/_static/img/biomart.png +0 -0
  30. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/_templates/module_summary.rst +0 -0
  31. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/changelog.rst +0 -0
  32. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/conf.py +0 -0
  33. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/contributing.rst +0 -0
  34. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/index.rst +0 -0
  35. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/install.rst +0 -0
  36. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/license.rst +0 -0
  37. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/transcript_selection.rst +0 -0
  38. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/docs/source/usage.rst +0 -0
  39. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/setup.cfg +0 -0
  40. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/__init__.py +0 -0
  41. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/app.py +0 -0
  42. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/handlers/__init__.py +0 -0
  43. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/handlers/seqrepo_access.py +0 -0
  44. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/mappers/alignment.py +0 -0
  45. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/mappers/exon_genomic_coords.py +0 -0
  46. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/mappers/liftover.py +0 -0
  47. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/mappers/mane_transcript.py +0 -0
  48. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/resources/__init__.py +0 -0
  49. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/resources/status.py +0 -0
  50. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/resources/transcript_mapping.tsv +0 -0
  51. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/sources/__init__.py +0 -0
  52. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/sources/mane_transcript_mappings.py +0 -0
  53. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/sources/transcript_mappings.py +0 -0
  54. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/sources/uta_database.py +0 -0
  55. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool/utils.py +0 -0
  56. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool.egg-info/dependency_links.txt +0 -0
  57. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/src/cool_seq_tool.egg-info/top_level.txt +0 -0
  58. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/handlers/test_seqrepo_access.py +0 -0
  59. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/mappers/test_alignment.py +0 -0
  60. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/mappers/test_exon_genomic_coords.py +0 -0
  61. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/mappers/test_liftover.py +0 -0
  62. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/sources/test_mane_transcript_mappings.py +0 -0
  63. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/sources/test_uta_database.py +0 -0
  64. {cool_seq_tool-0.14.1 → cool_seq_tool-0.14.2}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cool_seq_tool
3
- Version: 0.14.1
3
+ Version: 0.14.2
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -45,15 +45,13 @@ Requires-Python: >=3.10
45
45
  Description-Content-Type: text/markdown
46
46
  License-File: LICENSE
47
47
  Requires-Dist: asyncpg
48
- Requires-Dist: aiofiles
49
48
  Requires-Dist: boto3
50
49
  Requires-Dist: agct>=0.1.0-dev1
51
50
  Requires-Dist: polars~=1.0
52
- Requires-Dist: hgvs
53
51
  Requires-Dist: biocommons.seqrepo
54
52
  Requires-Dist: pydantic<3.0,>=2.0
55
53
  Requires-Dist: ga4gh.vrs<3.0,>=2.1.3
56
- Requires-Dist: wags-tails~=0.3.2
54
+ Requires-Dist: wags-tails~=0.4.0
57
55
  Requires-Dist: bioutils
58
56
  Provides-Extra: dev
59
57
  Requires-Dist: pre-commit>=4.2.0; extra == "dev"
@@ -64,7 +62,7 @@ Requires-Dist: ruff==0.12.1; extra == "dev"
64
62
  Provides-Extra: tests
65
63
  Requires-Dist: pytest; extra == "tests"
66
64
  Requires-Dist: pytest-cov; extra == "tests"
67
- Requires-Dist: pytest-asyncio==0.18.3; extra == "tests"
65
+ Requires-Dist: pytest-asyncio; extra == "tests"
68
66
  Requires-Dist: mock; extra == "tests"
69
67
  Provides-Extra: docs
70
68
  Requires-Dist: sphinx==6.1.3; extra == "docs"
@@ -55,6 +55,7 @@ Data Mappers
55
55
 
56
56
  cool_seq_tool.mappers.alignment
57
57
  cool_seq_tool.mappers.exon_genomic_coords
58
+ cool_seq_tool.mappers.feature_overlap
58
59
  cool_seq_tool.mappers.liftover
59
60
  cool_seq_tool.mappers.mane_transcript
60
61
 
@@ -25,15 +25,13 @@ description = "Common Operation on Lots of Sequences Tool"
25
25
  license = {file = "LICENSE"}
26
26
  dependencies = [
27
27
  "asyncpg",
28
- "aiofiles",
29
28
  "boto3",
30
29
  "agct >= 0.1.0-dev1",
31
30
  "polars ~= 1.0",
32
- "hgvs",
33
31
  "biocommons.seqrepo",
34
32
  "pydantic >=2.0,<3.0",
35
33
  "ga4gh.vrs >=2.1.3,<3.0",
36
- "wags-tails ~= 0.3.2",
34
+ "wags-tails ~= 0.4.0",
37
35
  "bioutils",
38
36
  ]
39
37
  dynamic = ["version"]
@@ -46,7 +44,7 @@ dev = [
46
44
  "psycopg2-binary",
47
45
  "ruff==0.12.1",
48
46
  ]
49
- tests = ["pytest", "pytest-cov", "pytest-asyncio==0.18.3", "mock"]
47
+ tests = ["pytest", "pytest-cov", "pytest-asyncio", "mock"]
50
48
  docs = [
51
49
  "sphinx==6.1.3",
52
50
  "sphinx-autodoc-typehints==1.22.0",
@@ -81,6 +79,9 @@ build-backend = "setuptools.build_meta"
81
79
  "cool_seq_tool.resources" = ["transcript_mapping.tsv"]
82
80
 
83
81
  [tool.pytest.ini_options]
82
+ asyncio_mode = "auto"
83
+ asyncio_default_fixture_loop_scope = "session"
84
+ asyncio_default_test_loop_scope = "session"
84
85
  addopts = "--cov=src --cov-report term-missing"
85
86
  testpaths = ["tests"]
86
87
 
@@ -4,6 +4,13 @@ from .alignment import AlignmentMapper # noqa: I001
4
4
  from .liftover import LiftOver
5
5
  from .mane_transcript import ManeTranscript
6
6
  from .exon_genomic_coords import ExonGenomicCoordsMapper
7
+ from .feature_overlap import FeatureOverlap
7
8
 
8
9
 
9
- __all__ = ["AlignmentMapper", "ExonGenomicCoordsMapper", "LiftOver", "ManeTranscript"]
10
+ __all__ = [
11
+ "AlignmentMapper",
12
+ "ExonGenomicCoordsMapper",
13
+ "FeatureOverlap",
14
+ "LiftOver",
15
+ "ManeTranscript",
16
+ ]
@@ -0,0 +1,251 @@
1
+ """Module for getting feature (gene/exon) overlap"""
2
+
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ from ga4gh.core import ga4gh_identify
8
+ from ga4gh.vrs.models import SequenceLocation, SequenceReference
9
+
10
+ from cool_seq_tool.handlers import SeqRepoAccess
11
+ from cool_seq_tool.resources.data_files import DataFile, get_data_file
12
+ from cool_seq_tool.schemas import Assembly, CdsOverlap, CoordinateType
13
+
14
# Pattern for a single GRCh38 chromosome name: 1..22, X, or Y (no "chr" prefix).
# The alternation is wrapped in a non-capturing group so that anchors or
# prefixes placed around the pattern (e.g. f"^{CHR_PATTERN}$") apply to every
# alternative instead of only the first/last one. The inner capturing group is
# retained so existing group numbering is unchanged.
CHR_PATTERN = r"(?:X|Y|([1-9]|1[0-9]|2[0-2]))"
16
+
17
+
18
class FeatureOverlapError(Exception):
    """Error raised when a feature overlap operation cannot be completed"""
20
+
21
+
22
class FeatureOverlap:
    """Find MANE gene/CDS features that overlap a GRCh38 genomic region.

    MANE RefSeq Genomic GFF records are loaded once at construction time into
    ``self.df`` and reused for every overlap query.
    """

    def __init__(
        self,
        seqrepo_access: SeqRepoAccess,
        mane_refseq_genomic_path: Path | None = None,
        from_local: bool = False,
    ) -> None:
        """Initialize the FeatureOverlap class. Will load RefSeq data and store as df.

        :param seqrepo_access: Client for accessing SeqRepo data
        :param mane_refseq_genomic_path: Path to MANE RefSeq Genomic GFF data. If
            not provided, the path is resolved with ``get_data_file``
        :param from_local: if ``True``, don't check for or acquire latest version --
            just provide most recent locally available file, if possible, and raise
            error otherwise
        """
        if not mane_refseq_genomic_path:
            mane_refseq_genomic_path = get_data_file(
                DataFile.MANE_REFSEQ_GENOMIC, from_local
            )
        self.seqrepo_access = seqrepo_access
        self.mane_refseq_genomic_path = mane_refseq_genomic_path
        self.df = self._load_mane_refseq_gff_data()

    def _load_mane_refseq_gff_data(self) -> pl.DataFrame:
        """Load MANE RefSeq GFF data file into DataFrame.

        :return: DataFrame containing MANE RefSeq Genomic GFF data for CDS. Columns
            include `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`,
            `cds_stop`, `info_name` (name of record), and `gene`. `cds_start` and
            `cds_stop` use inter-residue coordinates.
        """
        # Only five of the tab-separated columns are needed. The first 9 lines
        # are skipped (presumably the GFF header block -- TODO confirm).
        df = pl.read_csv(
            self.mane_refseq_genomic_path,
            separator="\t",
            has_header=False,
            skip_rows=9,
            columns=[0, 2, 3, 4, 8],
        )
        df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"]

        # Restrict to only feature of interest: CDS (which has gene info)
        df = df.filter(pl.col("type") == "CDS")

        # Get name from the info field
        # Get gene from the info field
        # Get chromosome names without prefix and without suffix for alternate transcripts
        # Convert start and stop to ints
        # Convert to inter-residue coordinates (only the start is shifted by one)
        # Only return certain columns
        return df.with_columns(
            (pl.col("info").str.extract(r"Name=([^;]+)", 1).alias("info_name")),
            (pl.col("info").str.extract(r"gene=([^;]+)", 1).alias("gene")),
            (pl.col("chromosome").str.extract(r"^chr?([^_]+)", 1).alias("chromosome")),
            (pl.col("cds_start").cast(pl.Int64) - 1).alias("cds_start"),
            (pl.col("cds_stop").cast(pl.Int64).alias("cds_stop")),
        ).select(
            [
                pl.col("type"),
                pl.col("chromosome"),
                pl.col("cds_start"),
                pl.col("cds_stop"),
                pl.col("info_name"),
                pl.col("gene"),
            ]
        )

    def _get_chr_from_alt_ac(self, identifier: str) -> str:
        """Get chromosome given genomic identifier

        :param identifier: Genomic identifier on GRCh38 assembly
        :raises FeatureOverlapError: If unable to find associated GRCh38 chromosome
        :return: Chromosome. 1..22, X, Y. No 'chr' prefix.
        """
        aliases, error_msg = self.seqrepo_access.translate_identifier(
            identifier, Assembly.GRCH38.value
        )

        if error_msg:
            raise FeatureOverlapError(str(error_msg))

        if not aliases:
            error_msg = (
                f"Unable to find {Assembly.GRCH38.value} aliases for: {identifier}"
            )
            raise FeatureOverlapError(error_msg)

        # CHR_PATTERN sits inside a named group here, so the anchors apply to
        # the whole alias regardless of how the alternation is written.
        assembly_chr_pattern = (
            rf"^{Assembly.GRCH38.value}:(?P<chromosome>{CHR_PATTERN})$"
        )
        chr_match = None
        for alias in aliases:
            chr_match = re.match(assembly_chr_pattern, alias)
            if chr_match:
                break

        if not chr_match:
            error_msg = (
                f"Unable to find {Assembly.GRCH38.value} chromosome for: {identifier}"
            )
            raise FeatureOverlapError(error_msg)

        return chr_match.groupdict()["chromosome"]

    def get_grch38_mane_gene_cds_overlap(
        self,
        start: int,
        end: int,
        chromosome: str | None = None,
        identifier: str | None = None,
        coordinate_type: CoordinateType = CoordinateType.RESIDUE,
    ) -> dict[str, list[CdsOverlap]] | None:
        """Given GRCh38 genomic data, find the overlapping MANE features (gene and cds).
        The genomic data is specified as a sequence location by `chromosome`, `start`,
        `end`. All CDS regions with which the input sequence location has nonzero base
        pair overlap will be returned.

        :param start: GRCh38 start position
        :param end: GRCh38 end position
        :param chromosome: Chromosome. 1..22, X, or Y. If not provided, must provide
            `identifier`. If both `chromosome` and `identifier` are provided,
            `chromosome` will be used.
        :param identifier: Genomic identifier on GRCh38 assembly. If not provided, must
            provide `chromosome`. If both `chromosome` and `identifier` are provided,
            `chromosome` will be used.
        :param coordinate_type: Coordinate type for ``start`` and ``end``
        :raises FeatureOverlapError: If missing required fields or unable to find
            associated ga4gh identifier
        :return: MANE feature (gene/cds) overlap data represented as a dict, or
            ``None`` if no CDS record matches. The dictionary will be keyed by genes
            which overlap the input sequence location. Each gene contains a list of
            the overlapping CDS regions with the beginning and end of the input
            sequence location's overlap with each CDS region
        """
        ga4gh_seq_id = None
        if chromosome:
            # Use fullmatch rather than wrapping CHR_PATTERN in "^...$": the
            # pattern is an alternation, so bare anchors would bind only to its
            # first and last alternatives and accept values such as "XYZ".
            if not re.fullmatch(CHR_PATTERN, chromosome):
                error_msg = "`chromosome` must be 1, ..., 22, X, or Y"
                raise FeatureOverlapError(error_msg)
        elif identifier:
            chromosome = self._get_chr_from_alt_ac(identifier)
            # A ga4gh sequence identifier can be reused directly for the output
            if identifier.startswith("ga4gh:SQ."):
                ga4gh_seq_id = identifier
        else:
            error_msg = "Must provide either `chromosome` or `identifier`"
            raise FeatureOverlapError(error_msg)

        # Convert residue to inter-residue
        if coordinate_type == CoordinateType.RESIDUE:
            start -= 1

        # Get feature dataframe (df uses inter-residue)
        # NOTE(review): these comparisons are inclusive, so a CDS that merely
        # abuts the query (shared boundary, zero bases in common) is selected
        # too, which appears to disagree with the "nonzero base pair overlap"
        # wording above -- confirm intended behavior before tightening.
        feature_df = self.df.filter(
            (pl.col("chromosome") == chromosome)
            & (pl.col("cds_start") <= end)
            & (pl.col("cds_stop") >= start)
        )

        if feature_df.is_empty():
            return None

        # Add overlap columns: clamp each CDS to the queried [start, end] range
        feature_df = feature_df.with_columns(
            [
                pl.when(pl.col("cds_start") < start)
                .then(start)
                .otherwise(pl.col("cds_start"))
                .alias("overlap_start"),
                pl.when(pl.col("cds_stop") > end)
                .then(end)
                .otherwise(pl.col("cds_stop"))
                .alias("overlap_stop"),
            ]
        )

        # Get ga4gh identifier for chromosome
        if not ga4gh_seq_id:
            grch38_chr = f"{Assembly.GRCH38.value}:{chromosome}"
            ga4gh_aliases, error_msg = self.seqrepo_access.translate_identifier(
                grch38_chr, "ga4gh"
            )

            # Errors should never happen but catching just in case
            if error_msg:
                raise FeatureOverlapError(str(error_msg))

            if not ga4gh_aliases:
                error_msg = f"Unable to find ga4gh identifier for: {grch38_chr}"
                raise FeatureOverlapError(error_msg)

            ga4gh_seq_id = ga4gh_aliases[0]

        def _get_seq_loc(start_pos: int, stop_pos: int, refget_ac: str) -> dict:
            """Get VRS Sequence Location represented as a dict

            :param start_pos: Start position
            :param stop_pos: Stop position
            :param refget_ac: Refget Accession (SQ.)
            :return: VRS Sequence Location represented as dictionary with the ga4gh ID
                included
            """
            _sl = SequenceLocation(
                sequenceReference=SequenceReference(
                    refgetAccession=refget_ac,
                ),
                start=start_pos,
                end=stop_pos,
            )
            # Called for its effect on ``_sl``; the return value is not used
            ga4gh_identify(_sl)
            return _sl.model_dump(exclude_none=True)

        resp = {}
        # Strip the "ga4gh:" namespace prefix, leaving the "SQ...." accession
        refget_ac = ga4gh_seq_id.split("ga4gh:")[-1]
        for gene_key, group in feature_df.group_by("gene"):
            # The group key is indexed to recover the plain gene value
            gene = gene_key[0]
            resp[gene] = [
                CdsOverlap(
                    cds=_get_seq_loc(
                        cds_row["cds_start"], cds_row["cds_stop"], refget_ac
                    ),
                    overlap=_get_seq_loc(
                        cds_row["overlap_start"], cds_row["overlap_stop"], refget_ac
                    ),
                ).model_dump(by_alias=True, exclude_none=True)
                for cds_row in group.iter_rows(named=True)
            ]

        return resp
@@ -6,7 +6,11 @@ from importlib import resources
6
6
  from os import environ
7
7
  from pathlib import Path
8
8
 
9
- from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData
9
+ from wags_tails import (
10
+ NcbiLrgRefSeqGeneData,
11
+ NcbiManeRefSeqGenomicData,
12
+ NcbiManeSummaryData,
13
+ )
10
14
 
11
15
  _logger = logging.getLogger(__name__)
12
16
 
@@ -16,6 +20,7 @@ class DataFile(str, Enum):
16
20
 
17
21
  TRANSCRIPT_MAPPINGS = "transcript_mappings"
18
22
  MANE_SUMMARY = "mane_summary"
23
+ MANE_REFSEQ_GENOMIC = "mane_refseq_genomic"
19
24
  LRG_REFSEQGENE = "lrg_refseqgene"
20
25
 
21
26
  def lower(self) -> str:
@@ -37,6 +42,12 @@ _resource_acquisition_params = {
37
42
  from_local=from_local
38
43
  )[0],
39
44
  ),
45
+ DataFile.MANE_REFSEQ_GENOMIC: (
46
+ "MANE_REFSEQ_GENOMIC_PATH",
47
+ lambda from_local: NcbiManeRefSeqGenomicData(silent=True).get_latest(
48
+ from_local=from_local
49
+ )[0],
50
+ ),
40
51
  DataFile.LRG_REFSEQGENE: (
41
52
  "LRG_REFSEQGENE_PATH",
42
53
  lambda from_local: NcbiLrgRefSeqGeneData(silent=True).get_latest(
@@ -53,6 +64,7 @@ def get_data_file(resource: DataFile, from_local: bool = False) -> Path:
53
64
 
54
65
  * ``Resource.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH``
55
66
  * ``Resource.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH``
67
+ * ``Resource.MANE_REFSEQ_GENOMIC`` -> ``MANE_REFSEQ_GENOMIC_PATH``
56
68
  * ``Resource.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH``
57
69
 
58
70
  Otherwise, this function falls back on default expected locations:
@@ -4,6 +4,7 @@ import datetime
4
4
  from enum import Enum, IntEnum
5
5
  from typing import Literal
6
6
 
7
+ from ga4gh.vrs.models import SequenceLocation
7
8
  from pydantic import (
8
9
  BaseModel,
9
10
  ConfigDict,
@@ -167,3 +168,37 @@ class ServiceMeta(BaseModelForbidExtra):
167
168
  }
168
169
  }
169
170
  )
171
+
172
+
173
class CdsOverlap(BaseModelForbidExtra):
    """Model pairing a CDS location with the overlapping portion of that location"""

    # Location of the CDS record
    cds: SequenceLocation
    # Location of the overlap with the CDS record
    overlap: SequenceLocation

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "cds": {
                    "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
                    "type": "SequenceLocation",
                    "sequenceReference": {
                        "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
                        "type": "SequenceReference",
                    },
                    "start": 140726493,
                    "end": 140726516,
                },
                "overlap": {
                    "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
                    "type": "SequenceLocation",
                    "sequenceReference": {
                        "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
                        "type": "SequenceReference",
                    },
                    "start": 140726493,
                    "end": 140726516,
                },
            }
        }
    )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cool_seq_tool
3
- Version: 0.14.1
3
+ Version: 0.14.2
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -45,15 +45,13 @@ Requires-Python: >=3.10
45
45
  Description-Content-Type: text/markdown
46
46
  License-File: LICENSE
47
47
  Requires-Dist: asyncpg
48
- Requires-Dist: aiofiles
49
48
  Requires-Dist: boto3
50
49
  Requires-Dist: agct>=0.1.0-dev1
51
50
  Requires-Dist: polars~=1.0
52
- Requires-Dist: hgvs
53
51
  Requires-Dist: biocommons.seqrepo
54
52
  Requires-Dist: pydantic<3.0,>=2.0
55
53
  Requires-Dist: ga4gh.vrs<3.0,>=2.1.3
56
- Requires-Dist: wags-tails~=0.3.2
54
+ Requires-Dist: wags-tails~=0.4.0
57
55
  Requires-Dist: bioutils
58
56
  Provides-Extra: dev
59
57
  Requires-Dist: pre-commit>=4.2.0; extra == "dev"
@@ -64,7 +62,7 @@ Requires-Dist: ruff==0.12.1; extra == "dev"
64
62
  Provides-Extra: tests
65
63
  Requires-Dist: pytest; extra == "tests"
66
64
  Requires-Dist: pytest-cov; extra == "tests"
67
- Requires-Dist: pytest-asyncio==0.18.3; extra == "tests"
65
+ Requires-Dist: pytest-asyncio; extra == "tests"
68
66
  Requires-Dist: mock; extra == "tests"
69
67
  Provides-Extra: docs
70
68
  Requires-Dist: sphinx==6.1.3; extra == "docs"
@@ -39,6 +39,7 @@ src/cool_seq_tool/handlers/seqrepo_access.py
39
39
  src/cool_seq_tool/mappers/__init__.py
40
40
  src/cool_seq_tool/mappers/alignment.py
41
41
  src/cool_seq_tool/mappers/exon_genomic_coords.py
42
+ src/cool_seq_tool/mappers/feature_overlap.py
42
43
  src/cool_seq_tool/mappers/liftover.py
43
44
  src/cool_seq_tool/mappers/mane_transcript.py
44
45
  src/cool_seq_tool/resources/__init__.py
@@ -51,6 +52,7 @@ src/cool_seq_tool/sources/transcript_mappings.py
51
52
  src/cool_seq_tool/sources/uta_database.py
52
53
  tests/conftest.py
53
54
  tests/test_utils.py
55
+ tests/handlers/test_feature_overlap.py
54
56
  tests/handlers/test_seqrepo_access.py
55
57
  tests/mappers/test_alignment.py
56
58
  tests/mappers/test_exon_genomic_coords.py
@@ -1,13 +1,11 @@
1
1
  asyncpg
2
- aiofiles
3
2
  boto3
4
3
  agct>=0.1.0-dev1
5
4
  polars~=1.0
6
- hgvs
7
5
  biocommons.seqrepo
8
6
  pydantic<3.0,>=2.0
9
7
  ga4gh.vrs<3.0,>=2.1.3
10
- wags-tails~=0.3.2
8
+ wags-tails~=0.4.0
11
9
  bioutils
12
10
 
13
11
  [dev]
@@ -29,5 +27,5 @@ sphinx-github-changelog==1.2.1
29
27
  [tests]
30
28
  pytest
31
29
  pytest-cov
32
- pytest-asyncio==0.18.3
30
+ pytest-asyncio
33
31
  mock
@@ -1,7 +1,5 @@
1
1
  """Provide utilities for test cases."""
2
2
 
3
- import asyncio
4
-
5
3
  import pytest
6
4
 
7
5
  from cool_seq_tool import CoolSeqTool
@@ -10,14 +8,6 @@ from cool_seq_tool.schemas import ManeGeneData, Strand
10
8
  from cool_seq_tool.sources.uta_database import GenomicAlnData, GenomicTxMetadata
11
9
 
12
10
 
13
- @pytest.fixture(scope="session")
14
- def event_loop(request):
15
- """Create an instance of the default event loop for each test case."""
16
- loop = asyncio.get_event_loop_policy().new_event_loop()
17
- yield loop
18
- loop.close()
19
-
20
-
21
11
  @pytest.fixture(scope="session")
22
12
  def test_cool_seq_tool():
23
13
  """Create CoolSeqTool test fixture"""
@@ -0,0 +1,492 @@
1
+ """Module for testing Feature Overlap class"""
2
+
3
+ import polars as pl
4
+ import pytest
5
+
6
+ from cool_seq_tool.mappers.feature_overlap import (
7
+ FeatureOverlap,
8
+ FeatureOverlapError,
9
+ )
10
+ from cool_seq_tool.schemas import CoordinateType
11
+
12
+
13
@pytest.fixture(scope="module")
def test_feature_overlap(test_seqrepo_access):
    """Provide a module-scoped FeatureOverlap instance for the tests"""
    feature_overlap = FeatureOverlap(test_seqrepo_access)
    return feature_overlap
17
+
18
+
19
def test_df(test_feature_overlap):
    """Check the type, columns, dtypes and chromosomes of the loaded dataframe"""
    df = test_feature_overlap.df

    # We only store CDS data
    assert list(df["type"].unique()) == ["CDS"]

    expected_columns = {
        "type",
        "chromosome",
        "cds_start",
        "cds_stop",
        "info_name",
        "gene",
    }
    assert set(df.columns) == expected_columns

    # Both coordinate columns must be 64-bit ints
    assert df["cds_start"].dtype == pl.Int64
    assert df["cds_stop"].dtype == pl.Int64

    # Autosomes 1..22 plus the two sex chromosomes, all without a prefix
    expected_chromosomes = {str(number) for number in range(1, 23)} | {"X", "Y"}
    assert set(df["chromosome"].unique()) == expected_chromosomes
62
+
63
+
64
def test_get_chr_from_alt_ac(test_feature_overlap):
    """Check chromosome lookup for valid, unversioned and GRCh37 identifiers"""
    get_chr = test_feature_overlap._get_chr_from_alt_ac

    assert get_chr("NC_000001.11") == "1"
    assert get_chr("NC_000023.11") == "X"

    # identifier is invalid (no version)
    with pytest.raises(FeatureOverlapError) as exc_info:
        get_chr("NC_000001")
    assert (
        str(exc_info.value)
        == "SeqRepo unable to get translated identifiers for NC_000001"
    )

    # identifier is grch37
    with pytest.raises(FeatureOverlapError) as exc_info:
        get_chr("NC_000001.10")
    assert str(exc_info.value) == "Unable to find GRCh38 aliases for: NC_000001.10"
81
+
82
+
83
+ def test_get_grch38_cds_overlap(test_feature_overlap):
84
+ """Test that get_grch38_mane_gene_cds_overlap works correctly"""
85
+ # Variant fully contains exon (negative strand)
86
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
87
+ 140726490, 140726520, identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"
88
+ )
89
+ assert resp == {
90
+ "BRAF": [
91
+ {
92
+ "cds": {
93
+ "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
94
+ "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
95
+ "type": "SequenceLocation",
96
+ "sequenceReference": {
97
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
98
+ "type": "SequenceReference",
99
+ },
100
+ "start": 140726493,
101
+ "end": 140726516,
102
+ },
103
+ "overlap": {
104
+ "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
105
+ "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
106
+ "type": "SequenceLocation",
107
+ "sequenceReference": {
108
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
109
+ "type": "SequenceReference",
110
+ },
111
+ "start": 140726493,
112
+ "end": 140726516,
113
+ },
114
+ }
115
+ ]
116
+ }
117
+
118
+ expected = {
119
+ "BRAF": [
120
+ {
121
+ "cds": {
122
+ "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
123
+ "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
124
+ "type": "SequenceLocation",
125
+ "sequenceReference": {
126
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
127
+ "type": "SequenceReference",
128
+ },
129
+ "start": 140726493,
130
+ "end": 140726516,
131
+ },
132
+ "overlap": {
133
+ "id": "ga4gh:SL.rMJlP7STVHBdvcCMgHkA4XJXXIdXnsix",
134
+ "digest": "rMJlP7STVHBdvcCMgHkA4XJXXIdXnsix",
135
+ "type": "SequenceLocation",
136
+ "sequenceReference": {
137
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
138
+ "type": "SequenceReference",
139
+ },
140
+ "start": 140726500,
141
+ "end": 140726501,
142
+ },
143
+ }
144
+ ]
145
+ }
146
+
147
+ # Using residue (start == stop)
148
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
149
+ 140726501,
150
+ 140726501,
151
+ identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
152
+ coordinate_type=CoordinateType.RESIDUE,
153
+ )
154
+ assert resp == expected
155
+
156
+ # Using inter-residue
157
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
158
+ 140726500,
159
+ 140726501,
160
+ identifier="ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
161
+ coordinate_type=CoordinateType.INTER_RESIDUE,
162
+ )
163
+ assert resp == expected
164
+
165
+ # Variant is fully contained within exon (positive strand)
166
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
167
+ 55019308, 55019341, chromosome="7"
168
+ )
169
+ assert resp == {
170
+ "EGFR": [
171
+ {
172
+ "cds": {
173
+ "id": "ga4gh:SL.vjxcgicBFEkN8b8AXhagvUDC7FZgZgCp",
174
+ "digest": "vjxcgicBFEkN8b8AXhagvUDC7FZgZgCp",
175
+ "type": "SequenceLocation",
176
+ "sequenceReference": {
177
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
178
+ "type": "SequenceReference",
179
+ },
180
+ "start": 55019277,
181
+ "end": 55019365,
182
+ },
183
+ "overlap": {
184
+ "id": "ga4gh:SL.a_MHSA9TJ5zMkxd52eBuRUNb5ZIXHH7T",
185
+ "digest": "a_MHSA9TJ5zMkxd52eBuRUNb5ZIXHH7T",
186
+ "type": "SequenceLocation",
187
+ "sequenceReference": {
188
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
189
+ "type": "SequenceReference",
190
+ },
191
+ "start": 55019307,
192
+ "end": 55019341,
193
+ },
194
+ }
195
+ ]
196
+ }
197
+
198
+ # Variant partially overlaps with exon, from the exon's start side (negative strand)
199
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
200
+ 140726503, 140726520, chromosome="7"
201
+ )
202
+ assert resp == {
203
+ "BRAF": [
204
+ {
205
+ "cds": {
206
+ "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
207
+ "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
208
+ "type": "SequenceLocation",
209
+ "sequenceReference": {
210
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
211
+ "type": "SequenceReference",
212
+ },
213
+ "start": 140726493,
214
+ "end": 140726516,
215
+ },
216
+ "overlap": {
217
+ "id": "ga4gh:SL.MdSOBEGp0l8wT3y1taeRvVIEi_XDBIGK",
218
+ "digest": "MdSOBEGp0l8wT3y1taeRvVIEi_XDBIGK",
219
+ "type": "SequenceLocation",
220
+ "sequenceReference": {
221
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
222
+ "type": "SequenceReference",
223
+ },
224
+ "start": 140726502,
225
+ "end": 140726516,
226
+ },
227
+ }
228
+ ]
229
+ }
230
+
231
+ # Variant partially overlaps with exon, from the exon's stop side (negative strand)
232
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
233
+ 140726490, 140726505, identifier="NC_000007.14"
234
+ )
235
+ assert resp == {
236
+ "BRAF": [
237
+ {
238
+ "cds": {
239
+ "id": "ga4gh:SL.fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
240
+ "digest": "fYRYzNIAoe6UQF9MT1XaYsFscoU68ZJv",
241
+ "type": "SequenceLocation",
242
+ "sequenceReference": {
243
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
244
+ "type": "SequenceReference",
245
+ },
246
+ "start": 140726493,
247
+ "end": 140726516,
248
+ },
249
+ "overlap": {
250
+ "id": "ga4gh:SL.Rjvup1y8hPgveXiYnj7dipqYkt3BaFZE",
251
+ "digest": "Rjvup1y8hPgveXiYnj7dipqYkt3BaFZE",
252
+ "type": "SequenceLocation",
253
+ "sequenceReference": {
254
+ "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
255
+ "type": "SequenceReference",
256
+ },
257
+ "start": 140726493,
258
+ "end": 140726505,
259
+ },
260
+ }
261
+ ]
262
+ }
263
+
264
+ # Variant overlaps with multiple exons (positive strand)
265
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
266
+ 21522390, 21523491, chromosome="Y"
267
+ )
268
+ assert resp == {
269
+ "RBMY1B": [
270
+ {
271
+ "cds": {
272
+ "id": "ga4gh:SL.3fbgdG4Z2a1fqkqkY8M2bQMfBhJsVr_i",
273
+ "digest": "3fbgdG4Z2a1fqkqkY8M2bQMfBhJsVr_i",
274
+ "type": "SequenceLocation",
275
+ "sequenceReference": {
276
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
277
+ "type": "SequenceReference",
278
+ },
279
+ "start": 21522382,
280
+ "end": 21522493,
281
+ },
282
+ "overlap": {
283
+ "id": "ga4gh:SL.XSqmOKSXECFtfjhcvVDmga72xQ0jVGO7",
284
+ "digest": "XSqmOKSXECFtfjhcvVDmga72xQ0jVGO7",
285
+ "type": "SequenceLocation",
286
+ "sequenceReference": {
287
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
288
+ "type": "SequenceReference",
289
+ },
290
+ "start": 21522389,
291
+ "end": 21522493,
292
+ },
293
+ },
294
+ {
295
+ "cds": {
296
+ "id": "ga4gh:SL.wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ",
297
+ "digest": "wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ",
298
+ "type": "SequenceLocation",
299
+ "sequenceReference": {
300
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
301
+ "type": "SequenceReference",
302
+ },
303
+ "start": 21522934,
304
+ "end": 21523045,
305
+ },
306
+ "overlap": {
307
+ "id": "ga4gh:SL.wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ",
308
+ "digest": "wi_fCVQHmZCOUf--3UPKm6johAu3zQYJ",
309
+ "type": "SequenceLocation",
310
+ "sequenceReference": {
311
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
312
+ "type": "SequenceReference",
313
+ },
314
+ "start": 21522934,
315
+ "end": 21523045,
316
+ },
317
+ },
318
+ {
319
+ "cds": {
320
+ "id": "ga4gh:SL.cnULzfdHZPiY6rSQP2Prfzocf1_YOBGA",
321
+ "digest": "cnULzfdHZPiY6rSQP2Prfzocf1_YOBGA",
322
+ "type": "SequenceLocation",
323
+ "sequenceReference": {
324
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
325
+ "type": "SequenceReference",
326
+ },
327
+ "start": 21523479,
328
+ "end": 21523590,
329
+ },
330
+ "overlap": {
331
+ "id": "ga4gh:SL.kHLoQs5cjOqjztK9yZyyVR9ScKCM1S8f",
332
+ "digest": "kHLoQs5cjOqjztK9yZyyVR9ScKCM1S8f",
333
+ "type": "SequenceLocation",
334
+ "sequenceReference": {
335
+ "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
336
+ "type": "SequenceReference",
337
+ },
338
+ "start": 21523479,
339
+ "end": 21523491,
340
+ },
341
+ },
342
+ ]
343
+ }
344
+
345
+ # Variant overlaps with multiple exons (negative strand)
346
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
347
+ 154779177, 154781317, chromosome="X"
348
+ )
349
+ assert resp == {
350
+ "MPP1": [
351
+ {
352
+ "cds": {
353
+ "id": "ga4gh:SL.dnHzxh-VwjVdanLcvKkI1otKhZeY223-",
354
+ "digest": "dnHzxh-VwjVdanLcvKkI1otKhZeY223-",
355
+ "type": "SequenceLocation",
356
+ "sequenceReference": {
357
+ "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
358
+ "type": "SequenceReference",
359
+ },
360
+ "start": 154781238,
361
+ "end": 154781313,
362
+ },
363
+ "overlap": {
364
+ "id": "ga4gh:SL.dnHzxh-VwjVdanLcvKkI1otKhZeY223-",
365
+ "digest": "dnHzxh-VwjVdanLcvKkI1otKhZeY223-",
366
+ "type": "SequenceLocation",
367
+ "sequenceReference": {
368
+ "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
369
+ "type": "SequenceReference",
370
+ },
371
+ "start": 154781238,
372
+ "end": 154781313,
373
+ },
374
+ },
375
+ {
376
+ "cds": {
377
+ "id": "ga4gh:SL.Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D",
378
+ "digest": "Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D",
379
+ "type": "SequenceLocation",
380
+ "sequenceReference": {
381
+ "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
382
+ "type": "SequenceReference",
383
+ },
384
+ "start": 154779176,
385
+ "end": 154779353,
386
+ },
387
+ "overlap": {
388
+ "id": "ga4gh:SL.Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D",
389
+ "digest": "Z4jQtiT0-FplZWGVA1wNhdCaCMKGQ17D",
390
+ "type": "SequenceLocation",
391
+ "sequenceReference": {
392
+ "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
393
+ "type": "SequenceReference",
394
+ },
395
+ "start": 154779176,
396
+ "end": 154779353,
397
+ },
398
+ },
399
+ ]
400
+ }
401
+
402
+ # Variant overlap with cds in multiple genes and alt chromosome accession
403
+ # chr19_KI270930v1_alt with exact start/stop CDS
404
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
405
+ 135329, 135381, chromosome="19"
406
+ )
407
+ expected = {
408
+ "KIR2DL5B": [
409
+ {
410
+ "cds": {
411
+ "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
412
+ "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
413
+ "type": "SequenceLocation",
414
+ "sequenceReference": {
415
+ "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
416
+ "type": "SequenceReference",
417
+ },
418
+ "start": 135328,
419
+ "end": 135381,
420
+ },
421
+ "overlap": {
422
+ "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
423
+ "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
424
+ "type": "SequenceLocation",
425
+ "sequenceReference": {
426
+ "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
427
+ "type": "SequenceReference",
428
+ },
429
+ "start": 135328,
430
+ "end": 135381,
431
+ },
432
+ }
433
+ ],
434
+ "FCGBP": [
435
+ {
436
+ "cds": {
437
+ "id": "ga4gh:SL.3G3gZfvJ56-y-TRWNSAUHUmyPi_8X3qK",
438
+ "digest": "3G3gZfvJ56-y-TRWNSAUHUmyPi_8X3qK",
439
+ "type": "SequenceLocation",
440
+ "sequenceReference": {
441
+ "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
442
+ "type": "SequenceReference",
443
+ },
444
+ "start": 135263,
445
+ "end": 135807,
446
+ },
447
+ "overlap": {
448
+ "id": "ga4gh:SL.tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
449
+ "digest": "tR0TL0hHD3udyK9at0snGQ3zNSmhCz6K",
450
+ "type": "SequenceLocation",
451
+ "sequenceReference": {
452
+ "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
453
+ "type": "SequenceReference",
454
+ },
455
+ "start": 135328,
456
+ "end": 135381,
457
+ },
458
+ }
459
+ ],
460
+ }
461
+ assert resp == expected
462
+
463
+ # Using inter-residue (start != stop)
464
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(
465
+ 135328, 135381, chromosome="19", coordinate_type=CoordinateType.INTER_RESIDUE
466
+ )
467
+ assert resp == expected
468
+
469
+ # No overlap found
470
+ resp = test_feature_overlap.get_grch38_mane_gene_cds_overlap(1, 2, chromosome="19")
471
+ assert resp is None
472
+
473
+ # Testing invalid
474
+
475
+ # chromosome does not match regex pattern
476
+ with pytest.raises(FeatureOverlapError) as e:
477
+ test_feature_overlap.get_grch38_mane_gene_cds_overlap(
478
+ 154779177, 154781317, chromosome="chrX"
479
+ )
480
+ assert str(e.value) == "`chromosome` must be 1, ..., 22, X, or Y"
481
+
482
+ # identifier is GRCh37
483
+ with pytest.raises(FeatureOverlapError) as e:
484
+ test_feature_overlap.get_grch38_mane_gene_cds_overlap(
485
+ 154779177, 154781317, identifier="NC_000023.10"
486
+ )
487
+ assert str(e.value) == "Unable to find GRCh38 aliases for: NC_000023.10"
488
+
489
+ # no identifier or chromosome provided
490
+ with pytest.raises(FeatureOverlapError) as e:
491
+ test_feature_overlap.get_grch38_mane_gene_cds_overlap(154779177, 154781317)
492
+ assert str(e.value) == "Must provide either `chromosome` or `identifier`"
@@ -298,8 +298,8 @@ async def test_g_to_c(
298
298
 
299
299
  def test_set_liftover(test_mane_transcript, genomic_tx_data):
300
300
  """Test that _set_liftover works correctly."""
301
- cpy = genomic_tx_data.copy(deep=True)
302
- expected = genomic_tx_data.copy(deep=True)
301
+ cpy = genomic_tx_data.model_copy(deep=True)
302
+ expected = genomic_tx_data.model_copy(deep=True)
303
303
  test_mane_transcript._set_liftover(cpy, "alt_pos_range", "chr7", "GRCh38")
304
304
  expected.alt_pos_range = (140739811, 140739946)
305
305
  assert cpy == expected
@@ -311,8 +311,8 @@ def test_set_liftover(test_mane_transcript, genomic_tx_data):
311
311
  @pytest.mark.asyncio
312
312
  async def test_liftover_to_38(test_mane_transcript, genomic_tx_data):
313
313
  """Test that liftover_to_38 works correctly."""
314
- cpy = genomic_tx_data.copy(deep=True)
315
- expected = genomic_tx_data.copy(deep=True)
314
+ cpy = genomic_tx_data.model_copy(deep=True)
315
+ expected = genomic_tx_data.model_copy(deep=True)
316
316
  await test_mane_transcript._liftover_to_38(cpy)
317
317
  expected.alt_ac = "NC_000007.14"
318
318
  expected.alt_pos_change_range = (140739903, 140739903)
@@ -675,7 +675,7 @@ async def test_g_to_grch38(test_mane_transcript, grch38_egfr, grch38_braf):
675
675
  resp = await test_mane_transcript.g_to_grch38(
676
676
  "NC_000007.13", 55259515, 55259515, get_mane_genes=False
677
677
  )
678
- grch38_egfr_no_genes = grch38_egfr.copy()
678
+ grch38_egfr_no_genes = grch38_egfr.model_copy()
679
679
  grch38_egfr_no_genes.mane_genes = []
680
680
  assert resp == grch38_egfr_no_genes
681
681
 
File without changes
File without changes
File without changes