protein-quest 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protein_quest/__init__.py +0 -0
- protein_quest/__version__.py +2 -0
- protein_quest/alphafold/__init__.py +1 -0
- protein_quest/alphafold/confidence.py +226 -0
- protein_quest/alphafold/entry_summary.py +64 -0
- protein_quest/alphafold/fetch.py +534 -0
- protein_quest/cli.py +1428 -0
- protein_quest/converter.py +46 -0
- protein_quest/emdb.py +37 -0
- protein_quest/filters.py +163 -0
- protein_quest/go.py +165 -0
- protein_quest/io.py +350 -0
- protein_quest/mcp_server.py +256 -0
- protein_quest/parallel.py +104 -0
- protein_quest/pdbe/__init__.py +1 -0
- protein_quest/pdbe/fetch.py +68 -0
- protein_quest/py.typed +0 -0
- protein_quest/ss.py +280 -0
- protein_quest/structure.py +232 -0
- protein_quest/taxonomy.py +149 -0
- protein_quest/uniprot.py +975 -0
- protein_quest/utils.py +547 -0
- protein_quest-0.9.0.dist-info/METADATA +325 -0
- protein_quest-0.9.0.dist-info/RECORD +27 -0
- protein_quest-0.9.0.dist-info/WHEEL +4 -0
- protein_quest-0.9.0.dist-info/entry_points.txt +2 -0
- protein_quest-0.9.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Module for searching taxon information from UniProt."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import gzip
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Literal, get_args
|
|
8
|
+
|
|
9
|
+
from aiohttp.client import ClientResponse
|
|
10
|
+
from aiohttp_retry import RetryClient
|
|
11
|
+
from cattrs.gen import make_dict_structure_fn, override
|
|
12
|
+
from yarl import URL
|
|
13
|
+
|
|
14
|
+
from protein_quest.converter import converter
|
|
15
|
+
from protein_quest.go import TextIOWrapper
|
|
16
|
+
from protein_quest.utils import friendly_session
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True, slots=True)
class Taxon:
    """Dataclass representing a taxon.

    Attributes:
        taxon_id: The unique identifier for the taxon.
        scientific_name: The scientific name of the taxon.
        rank: The taxonomic rank of the taxon (e.g., species, genus).
        common_name: The common name of the taxon (if available).
        other_names: A set of other names for the taxon (if available).
    """

    taxon_id: str
    scientific_name: str
    rank: str
    common_name: str | None = None
    other_names: set[str] | None = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True, slots=True)
class SearchTaxonResponse:
    """Envelope for the JSON body returned by the UniProt taxonomy search endpoint."""

    results: list[Taxon]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Teach the shared cattrs converter how to build a Taxon from the UniProt JSON,
# mapping the API's camelCase keys onto Taxon's snake_case fields.
converter.register_structure_hook(
    Taxon,
    make_dict_structure_fn(
        Taxon,
        converter,
        taxon_id=override(rename="taxonId"),
        scientific_name=override(rename="scientificName"),
        common_name=override(rename="commonName"),
        other_names=override(rename="otherNames"),
    ),
)
|
|
56
|
+
|
|
57
|
+
SearchField = Literal["tax_id", "scientific", "common", "parent"]
"""Type of search field"""
# None is also accepted by search_taxon and means "search across all fields".
search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
"""Set of valid search fields"""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _get_next_page(response: ClientResponse) -> URL | str | None:
    """Return the URL of the next results page from the response's Link header, or None."""
    link = response.links.getone("next", None)
    return None if link is None else link.getone("url", None)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def _fetch_page(url: URL | str, session: RetryClient) -> tuple[list[Taxon], URL | str | None]:
    """Retrieve and parse one page of taxonomy search results.

    Args:
        url: Page URL to fetch.
        session: HTTP session used for the request.

    Returns:
        The taxons on this page, plus the URL of the following page
        (None when this was the last page).
    """
    async with session.get(url) as response:
        response.raise_for_status()
        compressed_body = await response.read()
        following_page = _get_next_page(response)
        # Responses are requested with compressed=true, so gunzip before parsing.
        body = gzip.decompress(compressed_body)
        parsed = converter.loads(body, SearchTaxonResponse)
        return parsed.results, following_page
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def search_taxon(query: str, field: SearchField | None = None, limit: int = 100) -> list[Taxon]:
    """Search for taxon information in UniProt.

    Uses <https://www.uniprot.org/taxonomy?query=*>.

    Args:
        query: Search query for the taxon.
        field: Field to search in.
            If None, searches in all fields.
            If "tax_id" then searches by taxon ID.
            If "parent" then given a parent taxon ID returns all its children.
            For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
        limit: Maximum number of results to return.

    Returns:
        List of Taxon objects matching the search query.

    Raises:
        ValueError: If the search field is invalid.
    """
    # Example request:
    # https://rest.uniprot.org/taxonomy/search?compressed=true&format=json&query=%28Mouse%29&size=500
    page_size = 100
    if field not in search_fields:
        msg = f"Invalid search field: {field}. Must be one of {search_fields}."
        raise ValueError(msg)
    if field is not None:
        # Wrap as a fielded query, e.g. ((common:"house+mouse"))
        query = f'(({field}:"{query}"))'
    # NOTE: the UniProt REST API calls its page-size parameter "size" (see example URL
    # above); the previous "limit" key is not a recognized parameter, so the server fell
    # back to its default page size.
    params = {"query": query, "size": str(page_size), "compressed": "true", "format": "json"}
    url = URL("https://rest.uniprot.org/taxonomy/search").with_query(params)
    logger.debug("Fetching uniprot taxonomy from %s with params %s", url, params)
    async with friendly_session() as session:
        # Fetch first page
        taxons, next_page = await _fetch_page(url, session)
        # Follow Link headers until we have enough results or run out of pages.
        while next_page is not None and len(taxons) < limit:
            logger.debug("Fetching next page of uniprot taxonomy from %s", next_page)
            page, next_page = await _fetch_page(next_page, session)
            taxons.extend(page)
    return taxons[:limit]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _write_taxonomy_csv(taxons: list[Taxon], output_csv: TextIOWrapper) -> None:
    """Write taxon information to a CSV file.

    Args:
        taxons: List of Taxon objects to write to the CSV file.
        output_csv: File object for the output CSV file.
    """
    columns = ["taxon_id", "scientific_name", "common_name", "rank", "other_names"]
    writer = csv.writer(output_csv)
    writer.writerow(columns)
    writer.writerows(
        [
            entry.taxon_id,
            entry.scientific_name,
            entry.common_name,
            entry.rank,
            # other_names is optional; flatten the set into a ;-separated cell.
            ";".join(entry.other_names) if entry.other_names else "",
        ]
        for entry in taxons
    )
|