protein-quest 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of protein-quest might be problematic.

@@ -0,0 +1,185 @@
+ """Module for structure file input/output."""
+
+ import gzip
+ import logging
+ from collections.abc import Generator
+ from pathlib import Path
+
+ import gemmi
+
+ from protein_quest import __version__
+
+ logger = logging.getLogger(__name__)
+
+
+ def nr_residues_in_chain(file: Path | str, chain: str = "A") -> int:
+     """Returns the number of residues in a specific chain from a mmCIF/pdb file.
+
+     Args:
+         file: Path to the input mmCIF/pdb file.
+         chain: Chain to count residues of.
+
+     Returns:
+         The number of residues in the specified chain.
+     """
+     structure = gemmi.read_structure(str(file))
+     model = structure[0]
+     gchain = find_chain_in_model(model, chain)
+     if gchain is None:
+         logger.warning("Chain %s not found in %s. Returning 0.", chain, file)
+         return 0
+     return len(gchain)
+
+
+ def find_chain_in_model(model: gemmi.Model, wanted_chain: str) -> gemmi.Chain | None:
+     chain = model.find_chain(wanted_chain)
+     if chain is None:
+         # For chain A in 4v92 the find_chain method returns None,
+         # however it is prefixed with 'B',
+         # so we try again as last char of chain name
+         mchains = [c for c in model if c.name.endswith(wanted_chain)]
+         if mchains:
+             return mchains[0]
+     return chain
+
+
+ def write_structure(structure: gemmi.Structure, path: Path):
+     """Write a gemmi structure to a file.
+
+     Args:
+         structure: The gemmi structure to write.
+         path: The file path to write the structure to.
+             The format depends on the file extension.
+             Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz.
+
+     Raises:
+         ValueError: If the file extension is not supported.
+     """
+     if path.name.endswith(".pdb"):
+         body: str = structure.make_pdb_string()
+         path.write_text(body)
+     elif path.name.endswith(".pdb.gz"):
+         body: str = structure.make_pdb_string()
+         with gzip.open(path, "wt") as f:
+             f.write(body)
+     elif path.name.endswith(".cif"):
+         doc = structure.make_mmcif_document()
+         doc.write_file(str(path))
+     elif path.name.endswith(".cif.gz"):
+         doc = structure.make_mmcif_document()
+         cif_str = doc.as_string()
+         with gzip.open(path, "wt") as f:
+             f.write(cif_str)
+     else:
+         msg = f"Unsupported file extension in {path.name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
+         raise ValueError(msg)
+
+
+ def _split_name_and_extension(name: str) -> tuple[str, str]:
+     # 1234.pdb -> (1234, .pdb)
+     # 1234.pdb.gz -> (1234, .pdb.gz)
+     # 1234.cif -> (1234, .cif)
+     # 1234.cif.gz -> (1234, .cif.gz)
+     if name.endswith(".pdb.gz"):
+         return name.replace(".pdb.gz", ""), ".pdb.gz"
+     if name.endswith(".cif.gz"):
+         return name.replace(".cif.gz", ""), ".cif.gz"
+     if name.endswith(".pdb"):
+         return name.replace(".pdb", ""), ".pdb"
+     if name.endswith(".cif"):
+         return name.replace(".cif", ""), ".cif"
+
+     msg = f"Unknown file extension in {name}. Supported extensions are .pdb, .pdb.gz, .cif, .cif.gz"
+     raise ValueError(msg)
+
+
+ def locate_structure_file(root: Path, pdb_id: str) -> Path:
+     """Locate a structure file for a given PDB ID in the specified directory.
+
+     Args:
+         root: The root directory to search in.
+         pdb_id: The PDB ID to locate.
+
+     Returns:
+         The path to the located structure file.
+
+     Raises:
+         FileNotFoundError: If no structure file is found for the given PDB ID.
+     """
+     exts = [".cif.gz", ".cif", ".pdb.gz", ".pdb"]
+     # files downloaded from https://www.ebi.ac.uk/pdbe/ website
+     # have file names like pdb6t5y.ent or pdb6t5y.ent.gz for a PDB formatted file.
+     # TODO support pdb6t5y.ent or pdb6t5y.ent.gz file names
+     for ext in exts:
+         candidate = root / f"{pdb_id.lower()}{ext}"
+         if candidate.exists():
+             return candidate
+     msg = f"No structure file found for {pdb_id} in {root}"
+     raise FileNotFoundError(msg)
+
+
+ def glob_structure_files(input_dir: Path) -> Generator[Path]:
+     """Glob for structure files in a directory.
+
+     Args:
+         input_dir: The input directory to search for structure files.
+
+     Yields:
+         Paths to the found structure files.
+     """
+     for ext in [".cif.gz", ".cif", ".pdb.gz", ".pdb"]:
+         yield from input_dir.glob(f"*{ext}")
+
+
+ def write_single_chain_pdb_file(
+     input_file: Path, chain2keep: str, output_dir: Path, out_chain: str = "A"
+ ) -> Path | None:
+     """Write a single chain from a mmCIF/pdb file to a new mmCIF/pdb file.
+
+     Args:
+         input_file: Path to the input mmCIF/pdb file.
+         chain2keep: The chain to keep.
+         output_dir: Directory to save the output file.
+         out_chain: The chain identifier for the output file.
+
+     Returns:
+         Path to the output mmCIF/pdb file or None if not created.
+     """
+
+     structure = gemmi.read_structure(str(input_file))
+     model = structure[0]
+
+     # Only count residues of polymer
+     model.remove_ligands_and_waters()
+
+     chain = find_chain_in_model(model, chain2keep)
+     if chain is None:
+         logger.warning(
+             "Chain %s not found in %s. Skipping.",
+             chain2keep,
+             input_file,
+         )
+         return None
+     name, extension = _split_name_and_extension(input_file.name)
+     output_file = output_dir / f"{name}_{chain.name}2{out_chain}{extension}"
+
+     new_structure = gemmi.Structure()
+     new_structure.resolution = structure.resolution
+     new_id = structure.name + f"{chain2keep}2{out_chain}"
+     new_structure.name = new_id
+     new_structure.info["_entry.id"] = new_id
+     new_title = f"From {structure.info['_entry.id']} chain {chain2keep} to {out_chain}"
+     new_structure.info["_struct.title"] = new_title
+     new_structure.info["_struct_keywords.pdbx_keywords"] = new_title.upper()
+     new_si = gemmi.SoftwareItem()
+     new_si.classification = gemmi.SoftwareItem.Classification.DataExtraction
+     new_si.name = "protein-quest"
+     new_si.version = str(__version__)
+     new_structure.meta.software.append(new_si)
+     new_model = gemmi.Model(1)
+     chain.name = out_chain
+     new_model.add_chain(chain)
+     new_structure.add_model(new_model)
+     write_structure(new_structure, output_file)
+
+     return output_file
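
A minimal usage sketch of the structure I/O helpers added above. It is not part of the release; the import path protein_quest.structure and the example file layout are assumptions, since the diff does not show the new module's path.

from pathlib import Path

# Hypothetical import path: the diff does not show where this module lives.
from protein_quest.structure import (
    locate_structure_file,
    nr_residues_in_chain,
    write_single_chain_pdb_file,
)

root = Path("downloads")          # directory holding e.g. 4v92.cif.gz
out_dir = Path("single_chains")
out_dir.mkdir(exist_ok=True)

structure_file = locate_structure_file(root, "4V92")  # PDB ID is lowercased internally
print(nr_residues_in_chain(structure_file, chain="A"))

# Keep only chain A, renamed to "A" in the output; returns None if the chain is missing.
output = write_single_chain_pdb_file(structure_file, "A", out_dir, out_chain="A")
print(output)
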
protein_quest/py.typed ADDED
File without changes
@@ -0,0 +1,139 @@
+ """Module for searching taxon information from UniProt."""
+
+ import csv
+ import gzip
+ import logging
+ from dataclasses import dataclass
+ from typing import Literal, get_args
+
+ from aiohttp.client import ClientResponse
+ from aiohttp_retry import RetryClient
+ from cattrs.gen import make_dict_structure_fn, override
+ from cattrs.preconf.orjson import make_converter
+ from yarl import URL
+
+ from protein_quest.go import TextIOWrapper
+ from protein_quest.utils import friendly_session
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True, slots=True)
+ class Taxon:
+     taxon_id: str
+     scientific_name: str
+     rank: str
+     common_name: str | None = None
+     other_names: set[str] | None = None
+
+
+ @dataclass(frozen=True, slots=True)
+ class SearchTaxonResponse:
+     results: list[Taxon]
+
+
+ converter = make_converter()
+
+ converter.register_structure_hook(
+     Taxon,
+     make_dict_structure_fn(
+         Taxon,
+         converter,
+         taxon_id=override(rename="taxonId"),
+         scientific_name=override(rename="scientificName"),
+         common_name=override(rename="commonName"),
+         other_names=override(rename="otherNames"),
+     ),
+ )
+
+ SearchField = Literal["tax_id", "scientific", "common", "parent"]
+ search_fields: set[SearchField | None] = set(get_args(SearchField)) | {None}
+
+
+ def _get_next_page(response: ClientResponse) -> URL | str | None:
+     next_page = response.links.getone("next", None)
+     if next_page is None:
+         return None
+     return next_page.getone("url", None)
+
+
+ async def _fetch_page(url: URL | str, session: RetryClient) -> tuple[list[Taxon], URL | str | None]:
+     async with session.get(url) as response:
+         response.raise_for_status()
+         gzipped_raw_data = await response.read()
+         next_page = _get_next_page(response)
+         raw_data = gzip.decompress(gzipped_raw_data)
+         taxons = converter.loads(raw_data, SearchTaxonResponse).results
+         return taxons, next_page
+
+
+ async def search_taxon(query: str, field: SearchField | None = None, limit: int = 100) -> list[Taxon]:
+     """Search for taxon information in UniProt.
+
+     Uses <https://www.uniprot.org/taxonomy?query=*>.
+
+     Args:
+         query: Search query for the taxon.
+         field: Field to search in.
+             If None, searches in all fields.
+             If "tax_id" then searches by taxon ID.
+             If "parent" then given a parent taxon ID returns all its children.
+             For example, if the parent taxon ID is 9606 (Human), it will return Neanderthal and Denisovan.
+         limit: Maximum number of results to return.
+
+     Returns:
+         List of Taxon objects matching the search query.
+
+     Raises:
+         ValueError: If the search field is invalid.
+     """
+     # https://rest.uniprot.org/taxonomy/search?compressed=true&format=json&query=%28Mouse%29&size=500
+     page_limit = 100
+     if field not in search_fields:
+         msg = f"Invalid search field: {field}. Must be one of {search_fields}."
+         raise ValueError(msg)
+     if field is not None:
+         # ((common:"house+mouse"))
+         query = f'(({field}:"{query}"))'
+     params = {"query": query, "limit": str(page_limit), "compressed": "true", "format": "json"}
+     url = URL("https://rest.uniprot.org/taxonomy/search").with_query(params)
+     logger.debug("Fetching uniprot taxonomy from %s with params %s", url, params)
+     async with friendly_session() as session:
+         # Fetch first page
+         taxons, next_page = await _fetch_page(url, session)
+         if len(taxons) >= limit:
+             return taxons[:limit]
+         if next_page is None:
+             return taxons
+
+         # Fetch next pages
+         while next_page:
+             logger.debug("Fetching next page of uniprot taxonomy from %s", next_page)
+             next_data, next_page = await _fetch_page(next_page, session)
+             taxons.extend(next_data)
+             if len(taxons) >= limit:
+                 return taxons[:limit]
+             if next_page is None:
+                 return taxons
+     return taxons
+
+
+ def _write_taxonomy_csv(taxons: list[Taxon], output_csv: TextIOWrapper) -> None:
+     """Write taxon information to a CSV file.
+
+     Args:
+         taxons: List of Taxon objects to write to the CSV file.
+         output_csv: File object for the output CSV file.
+     """
+     writer = csv.writer(output_csv)
+     writer.writerow(["taxon_id", "scientific_name", "common_name", "rank", "other_names"])
+     for taxon in taxons:
+         writer.writerow(
+             [
+                 taxon.taxon_id,
+                 taxon.scientific_name,
+                 taxon.common_name,
+                 taxon.rank,
+                 ";".join(taxon.other_names) if taxon.other_names else "",
+             ]
+         )
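
A minimal usage sketch of the taxonomy search helpers added above. It is not part of the release; the import path protein_quest.taxonomy is an assumption, since the diff does not show the module's path. search_taxon is a coroutine, so it is driven here with asyncio.run.

import asyncio

# Hypothetical import path; the diff does not show where this module lives.
from protein_quest.taxonomy import _write_taxonomy_csv, search_taxon


async def main() -> None:
    # Search by common name and cap the result list at 10 taxa.
    taxons = await search_taxon("house mouse", field="common", limit=10)
    with open("taxons.csv", "w", newline="") as fh:
        _write_taxonomy_csv(taxons, fh)


asyncio.run(main())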