protein-quest 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ """Module for searching taxon information from UniProt."""
2
+
3
+ import csv
4
+ import gzip
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from typing import Literal, get_args
8
+
9
+ from aiohttp.client import ClientResponse
10
+ from aiohttp_retry import RetryClient
11
+ from cattrs.gen import make_dict_structure_fn, override
12
+ from yarl import URL
13
+
14
+ from protein_quest.converter import converter
15
+ from protein_quest.go import TextIOWrapper
16
+ from protein_quest.utils import friendly_session
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ @dataclass(frozen=True, slots=True)
22
+ class Taxon:
23
+ """Dataclass representing a taxon.
24
+
25
+ Arguments:
26
+ taxon_id: The unique identifier for the taxon.
27
+ scientific_name: The scientific name of the taxon.
28
+ rank: The taxonomic rank of the taxon (e.g., species, genus).
29
+ common_name: The common name of the taxon (if available).
30
+ other_names: A set of other names for the taxon (if available).
31
+ """
32
+
33
+ taxon_id: str
34
+ scientific_name: str
35
+ rank: str
36
+ common_name: str | None = None
37
+ other_names: set[str] | None = None
38
+
39
+
40
+ @dataclass(frozen=True, slots=True)
41
+ class SearchTaxonResponse:
42
+ results: list[Taxon]
43
+
44
+
45
# The payload keys are camelCase (e.g. "taxonId"); map each one onto the
# corresponding snake_case field of Taxon while structuring.
_structure_taxon = make_dict_structure_fn(
    Taxon,
    converter,
    taxon_id=override(rename="taxonId"),
    scientific_name=override(rename="scientificName"),
    common_name=override(rename="commonName"),
    other_names=override(rename="otherNames"),
)
converter.register_structure_hook(Taxon, _structure_taxon)
56
+
57
+ SearchField = Literal["tax_id", "scientific", "common", "parent"]
58
+ """Type of search field"""
59
+ search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
60
+ """Set of valid search fields"""
61
+
62
+
63
+ def _get_next_page(response: ClientResponse) -> URL | str | None:
64
+ next_page = response.links.getone("next", None)
65
+ if next_page is None:
66
+ return None
67
+ return next_page.getone("url", None)
68
+
69
+
70
async def _fetch_page(url: URL | str, session: RetryClient) -> tuple[list[Taxon], URL | str | None]:
    """Download a single page of taxonomy results.

    Args:
        url: Address of the page to request.
        session: Client used to issue the GET request.

    Returns:
        A pair of the taxons parsed from the page and the address of the
        following page as reported by `_get_next_page` (None when there is none).

    Raises:
        Whatever `response.raise_for_status()` raises for an error status.
    """
    async with session.get(url) as response:
        response.raise_for_status()
        compressed_body = await response.read()
        following_page = _get_next_page(response)
        # The body is gzip-compressed; inflate it before handing it to the converter.
        page = converter.loads(gzip.decompress(compressed_body), SearchTaxonResponse)
        return page.results, following_page
78
+
79
+
80
async def search_taxon(query: str, field: SearchField | None = None, limit: int = 100) -> list[Taxon]:
    """Search for taxon information in UniProt.

    Uses <https://www.uniprot.org/taxonomy?query=*>.

    Args:
        query: Search query for the taxon.
        field: Field to search in.
            If None, searches in all fields.
            If "tax_id" then searches by taxon ID.
            If "parent" then given a parent taxon ID returns all its children.
            For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
        limit: Maximum number of results to return.
            A limit of zero or less returns an empty list without sending a request.

    Returns:
        List of Taxon objects matching the search query, at most `limit` items long.

    Raises:
        ValueError: If the search field is invalid.
    """
    # https://rest.uniprot.org/taxonomy/search?compressed=true&format=json&query=%28Mouse%29&size=500
    page_size = 100
    if field not in search_fields:
        msg = f"Invalid search field: {field}. Must be one of {search_fields}."
        raise ValueError(msg)
    if limit <= 0:
        # Nothing can be returned; a negative limit would otherwise slice items off the end.
        return []
    if field is not None:
        # ((common:"house+mouse"))
        query = f'(({field}:"{query}"))'
    # UniProt's page-size parameter is called `size` (see the example URL above).
    params = {"query": query, "size": str(page_size), "compressed": "true", "format": "json"}
    url = URL("https://rest.uniprot.org/taxonomy/search").with_query(params)
    logger.debug("Fetching uniprot taxonomy from %s with params %s", url, params)
    async with friendly_session() as session:
        taxons, next_page = await _fetch_page(url, session)
        # Keep following the `next` links until enough results are collected
        # or the server reports no further page.
        while next_page and len(taxons) < limit:
            logger.debug("Fetching next page of uniprot taxonomy from %s", next_page)
            next_data, next_page = await _fetch_page(next_page, session)
            taxons.extend(next_data)
    return taxons[:limit]
129
+
130
+
131
def _write_taxonomy_csv(taxons: list[Taxon], output_csv: TextIOWrapper) -> None:
    """Write taxon information to a CSV file.

    A header row is written first, followed by one row per taxon with the
    columns taxon_id, scientific_name, common_name, rank and other_names.
    The other names are joined with ";" in sorted order, so the same input
    always produces the same file; the column is empty when there are none.

    Args:
        taxons: List of Taxon objects to write to the CSV file.
        output_csv: File object for the output CSV file.
    """
    writer = csv.writer(output_csv)
    writer.writerow(["taxon_id", "scientific_name", "common_name", "rank", "other_names"])
    writer.writerows(
        [
            taxon.taxon_id,
            taxon.scientific_name,
            taxon.common_name,
            taxon.rank,
            # other_names is a set, whose iteration order is arbitrary; sort for stable output.
            ";".join(sorted(taxon.other_names)) if taxon.other_names else "",
        ]
        for taxon in taxons
    )