protein-quest 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
protein_quest/io.py ADDED
@@ -0,0 +1,350 @@
1
+ """Module for structure file input/output."""
2
+
3
+ import gzip
4
+ import logging
5
+ import shutil
6
+ import tempfile
7
+ from collections.abc import Generator, Iterable
8
+ from io import StringIO
9
+ from pathlib import Path
10
+ from typing import Literal, get_args
11
+ from urllib.request import urlopen
12
+
13
+ import gemmi
14
+ from mmcif.api.DictionaryApi import DictionaryApi
15
+ from mmcif.io.BinaryCifReader import BinaryCifReader
16
+ from mmcif.io.BinaryCifWriter import BinaryCifWriter
17
+ from mmcif.io.PdbxReader import PdbxReader
18
+ from mmcif.io.PdbxWriter import PdbxWriter
19
+
20
+ from protein_quest.utils import CopyMethod, copyfile, user_cache_root_dir
21
+
22
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# TODO remove once v0.7.4 of gemmi is released,
# as uv pip install git+https://github.com/project-gemmi/gemmi.git installs 0.7.4.dev0 which does not print leaks
# Swallow gemmi leaked function warnings
# NOTE: this runs at import time, so importing this module changes gemmi's
# leak-warning setting for the whole process.
gemmi.set_leak_warnings(False)
28
+
29
+
30
# Closed set of file name suffixes this module can read and write.
# The ".gz" variants are the gzip-compressed forms of the same formats.
StructureFileExtensions = Literal[".pdb", ".pdb.gz", ".ent", ".ent.gz", ".cif", ".cif.gz", ".bcif", ".bcif.gz"]
"""Type of supported structure file extensions."""
# Runtime view of the Literal above, derived with get_args so the two cannot drift apart.
# Being a set, it has no defined iteration order.
valid_structure_file_extensions: set[str] = set(get_args(StructureFileExtensions))
"""Set of valid structure file extensions."""
34
+
35
+
36
def write_structure(structure: gemmi.Structure, path: Path):
    """Serialise a gemmi structure to disk in the format implied by the file name.

    Args:
        structure: The gemmi structure to serialise.
        path: Destination file. Its extension selects the output format.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.

    Raises:
        ValueError: If the file extension is not supported.
    """
    filename = path.name

    if filename.endswith((".pdb", ".ent")):
        path.write_text(structure.make_pdb_string())
        return

    if filename.endswith((".pdb.gz", ".ent.gz")):
        pdb_text: str = structure.make_pdb_string()
        with gzip.open(path, "wt") as gz_handle:
            gz_handle.write(pdb_text)
        return

    if filename.endswith(".cif"):
        # do not write chem_comp so it is viewable by molstar
        # see https://github.com/project-gemmi/gemmi/discussions/362
        groups = gemmi.MmcifOutputGroups(True, chem_comp=False)
        structure.make_mmcif_document(groups).write_file(str(path))
        return

    if filename.endswith(".cif.gz"):
        groups = gemmi.MmcifOutputGroups(True, chem_comp=False)
        cif_text = structure.make_mmcif_document(groups).as_string()
        with gzip.open(path, "wt") as gz_handle:
            gz_handle.write(cif_text)
        return

    if filename.endswith(".bcif"):
        structure2bcif(structure, path)
        return

    if filename.endswith(".bcif.gz"):
        structure2bcifgz(structure, path)
        return

    msg = f"Unsupported file extension in {path.name}. Supported extensions are: {valid_structure_file_extensions}"
    raise ValueError(msg)
73
+
74
+
75
def read_structure(file: Path) -> gemmi.Structure:
    """Load a structure file into a gemmi Structure.

    Binary CIF files (plain or gzipped) are routed through the bcif helpers
    of this module; every other file is handed to `gemmi.read_structure`.

    Args:
        file: Path to the input structure file.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.

    Returns:
        A gemmi Structure object representing the structure in the file.
    """
    filename = file.name
    if filename.endswith(".bcif.gz"):
        return bcifgz2structure(file)
    if filename.endswith(".bcif"):
        return bcif2structure(file)
    return gemmi.read_structure(str(file))
91
+
92
+
93
def bcif2cif(bcif_file: Path) -> str:
    """Render a binary CIF (bcif) file as CIF text.

    The file is deserialized with the mmcif `BinaryCifReader` and written
    back out with `PdbxWriter` into an in-memory buffer.

    Args:
        bcif_file: Path to the binary CIF file.

    Returns:
        A string containing the CIF representation of the structure.
    """
    containers = BinaryCifReader().deserialize(str(bcif_file))
    buffer = StringIO()
    PdbxWriter(buffer).write(containers)
    return buffer.getvalue()
108
+
109
+
110
def bcifgz2structure(bcif_gz_file: Path) -> gemmi.Structure:
    """Read a binary CIF (bcif) gzipped file and return a gemmi Structure object.

    This is slower than other formats because gemmi does not support reading bcif files directly.
    So we first gunzip the file to a temporary location, convert it to a cif string using mmcif package,
    and then read the cif string using gemmi.

    Args:
        bcif_gz_file: Path to the binary CIF gzipped file.

    Returns:
        A gemmi Structure object representing the structure in the bcif.gz file.
    """
    # Use a temporary directory rather than an open NamedTemporaryFile:
    # the helpers below open the path by name, and re-opening a still-open
    # named temporary file is not possible on every platform (e.g. Windows).
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "structure.bcif"
        gunzip_file(bcif_gz_file, output_file=tmp_path, keep_original=True)
        return bcif2structure(tmp_path)
127
+
128
+
129
def bcif2structure(bcif_file: Path) -> gemmi.Structure:
    """Load a binary CIF (bcif) file as a gemmi Structure.

    gemmi is not used to read the bcif file directly here: the file is first
    turned into CIF text with `bcif2cif` (mmcif package) and that text is
    then parsed by gemmi, which makes this slower than the other formats.

    Args:
        bcif_file: Path to the binary CIF file.

    Returns:
        A gemmi Structure object representing the structure in the bcif file.
    """
    # Keep the document bound to a name while its sole block is in use.
    document = gemmi.cif.read_string(bcif2cif(bcif_file))
    return gemmi.make_structure_from_block(document.sole_block())
145
+
146
+
147
def _initialize_dictionary_api(containers) -> DictionaryApi:
    """Ensure the mmCIF dictionary is cached locally and build a DictionaryApi.

    On first use the `mmcif_pdbx_v5_next.dic` dictionary is downloaded into the
    user cache directory. The download is written to a temporary file next to
    the final location and only renamed into place once it completed, so an
    interrupted or failed download never leaves a truncated cache file that
    later calls would mistake for a valid one.

    Args:
        containers: Data containers handed to `DictionaryApi` as `containerList`.

    Returns:
        A `DictionaryApi` built from `containers`.
    """
    dict_local = user_cache_root_dir() / "mmcif_pdbx_v5_next.dic"
    if not dict_local.exists():
        dict_url = "https://raw.githubusercontent.com/wwpdb-dictionaries/mmcif_pdbx/master/dist/mmcif_pdbx_v5_next.dic"
        logger.info("Downloading mmcif dictionary from %s to %s", dict_url, dict_local)
        dict_local.parent.mkdir(parents=True, exist_ok=True)
        # Unique temp name in the same directory, so the final rename stays on one filesystem.
        tmp = tempfile.NamedTemporaryFile(dir=dict_local.parent, suffix=".part", delete=False)
        partial = Path(tmp.name)
        try:
            with tmp, urlopen(dict_url) as response:  # noqa: S310 url is hardcoded and https
                shutil.copyfileobj(response, tmp)
            partial.replace(dict_local)
        finally:
            # No-op after a successful rename; removes the leftover on failure.
            partial.unlink(missing_ok=True)
    # NOTE(review): the dictionary file ensured above is never read in this function;
    # DictionaryApi is built from the `containers` argument instead. Confirm whether the
    # downloaded dictionary was meant to be parsed and passed as containerList.
    return DictionaryApi(containerList=containers, consolidate=True)
156
+
157
+
158
def structure2bcif(structure: gemmi.Structure, bcif_file: Path):
    """Save a gemmi Structure as a binary CIF (bcif) file.

    gemmi is not used to write bcif directly here: the structure is rendered
    to CIF text with gemmi (without the chem_comp group), parsed into mmcif
    data containers, and those are serialized with `BinaryCifWriter`.
    This makes it slower than the other formats.

    Args:
        structure: The gemmi Structure object to write.
        bcif_file: Path to the output binary CIF file.
    """
    output_groups = gemmi.MmcifOutputGroups(True, chem_comp=False)
    mmcif_doc = structure.make_mmcif_document(output_groups)

    data_containers = []
    with StringIO(mmcif_doc.as_string()) as cif_stream:
        PdbxReader(cif_stream).read(data_containers)

    api = _initialize_dictionary_api(data_containers)
    BinaryCifWriter(dictionaryApi=api).serialize(str(bcif_file), data_containers)
176
+
177
+
178
+ def gunzip_file(gz_file: Path, output_file: Path | None = None, keep_original: bool = True) -> Path:
179
+ """Unzip a .gz file.
180
+
181
+ Args:
182
+ gz_file: Path to the .gz file.
183
+ output_file: Optional path to the output unzipped file. If None, the .gz suffix is removed from gz_file.
184
+ keep_original: Whether to keep the original .gz file. Default is True.
185
+
186
+ Returns:
187
+ Path to the unzipped file.
188
+
189
+ Raises:
190
+ ValueError: If output_file is None and gz_file does not end with .gz.
191
+ """
192
+ if output_file is None and not gz_file.name.endswith(".gz"):
193
+ msg = f"If output_file is not provided, {gz_file} must end with .gz"
194
+ raise ValueError(msg)
195
+ out_file = output_file or gz_file.with_suffix("")
196
+ with gzip.open(gz_file, "rb") as f_in, out_file.open("wb") as f_out:
197
+ shutil.copyfileobj(f_in, f_out)
198
+ if not keep_original:
199
+ gz_file.unlink()
200
+ return out_file
201
+
202
+
203
def structure2bcifgz(structure: gemmi.Structure, bcif_gz_file: Path):
    """Write a gemmi Structure object to a binary CIF gzipped (bcif.gz) file.

    This is slower than other formats because gemmi does not support writing bcif files directly.
    So we convert it to a cif string first using gemmi and then convert cif to bcif using mmcif package.
    Finally, we gzip the bcif file.

    Args:
        structure: The gemmi Structure object to write.
        bcif_gz_file: Path to the output binary CIF gzipped file.
    """
    # Use a temporary directory rather than an open NamedTemporaryFile:
    # the bcif writer and the re-open below access the path by name, and
    # re-opening a still-open named temporary file is not possible on
    # every platform (e.g. Windows).
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "structure.bcif"
        structure2bcif(structure, tmp_path)
        with tmp_path.open("rb") as f_in, gzip.open(bcif_gz_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
219
+
220
+
221
def convert_to_cif_files(
    input_files: Iterable[Path], output_dir: Path, copy_method: CopyMethod
) -> Generator[tuple[Path, Path]]:
    """Lazily convert a series of structure files to .cif format.

    Each file is converted with `convert_to_cif_file` only when the
    generator is advanced to it.

    Args:
        input_files: Iterable of structure files to convert.
        output_dir: Directory to save the converted .cif files.
        copy_method: How to copy when no changes are needed to output file.

    Yields:
        A tuple of the input file and the output file.
    """
    yield from (
        (source, convert_to_cif_file(source, output_dir, copy_method))
        for source in input_files
    )
237
+
238
+
239
def convert_to_cif_file(input_file: Path, output_dir: Path, copy_method: CopyMethod) -> Path:
    """Convert a single structure file to .cif format.

    If the output file already exists nothing is converted and the existing
    path is returned.

    Args:
        input_file: The structure file to convert.
            See [StructureFileExtensions][protein_quest.io.StructureFileExtensions]
            for supported extensions.
        output_dir: Directory to save the converted .cif file.
        copy_method: How to copy when no changes are needed to output file.

    Returns:
        Path to the converted .cif file.

    Raises:
        ValueError: If the extension of input_file is not supported.
    """
    name, extension = split_name_and_extension(input_file.name)
    output_file = output_dir / f"{name}.cif"
    if output_file.exists():
        logger.info("Output file %s already exists for input file %s. Skipping.", output_file, input_file)
    elif extension in {".pdb", ".pdb.gz", ".ent", ".ent.gz"}:
        structure = read_structure(input_file)
        write_structure(structure, output_file)
    elif extension == ".cif":
        logger.info("File %s is already in .cif format, copying to %s", input_file, output_dir)
        copyfile(input_file, output_file, copy_method)
    elif extension == ".cif.gz":
        gunzip_file(input_file, output_file=output_file, keep_original=True)
    elif extension == ".bcif":
        # Convert before creating the output file, so a failed conversion does not
        # leave an empty .cif behind that later calls would skip as "already exists".
        cif_text = bcif2cif(input_file)
        output_file.write_text(cif_text)
    elif extension == ".bcif.gz":
        # Listed in StructureFileExtensions, so handle it like .bcif after gunzipping
        # to a scratch directory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_bcif = gunzip_file(input_file, output_file=Path(tmp_dir) / f"{name}.bcif", keep_original=True)
            cif_text = bcif2cif(tmp_bcif)
        output_file.write_text(cif_text)
    else:
        msg = (
            f"Unsupported file extension {extension} in {input_file}. "
            f"Supported extensions are {valid_structure_file_extensions}."
        )
        raise ValueError(msg)
    return output_file
274
+
275
+
276
def split_name_and_extension(name: str) -> tuple[str, str]:
    """Split a filename into its name and extension.

    `.gz` is considered part of the extension if present.
    A leading dot (hidden file) or a trailing dot is not treated as the
    start of an extension.

    Examples:
        Some example usages.

        >>> from protein_quest.io import split_name_and_extension
        >>> split_name_and_extension("1234.pdb")
        ('1234', '.pdb')
        >>> split_name_and_extension("1234.pdb.gz")
        ('1234', '.pdb.gz')
        >>> split_name_and_extension("archive.gz")
        ('archive', '.gz')
        >>> split_name_and_extension("noext")
        ('noext', '')

    Args:
        name: The filename to split.

    Returns:
        A tuple containing the name and the extension.
    """
    gz_suffix = ""
    if name.endswith(".gz"):
        gz_suffix = ".gz"
        name = name.removesuffix(".gz")
    dot = name.rfind(".")
    # The dot must have at least one character on each side to start an extension.
    if 0 < dot < len(name) - 1:
        return name[:dot], name[dot:] + gz_suffix
    return name, gz_suffix
305
+
306
+
307
def locate_structure_file(root: Path, pdb_id: str) -> Path:
    """Locate a structure file for a given PDB ID in the specified directory.

    Uses [StructureFileExtensions][protein_quest.io.StructureFileExtensions] as potential extensions,
    tried in the order they are declared there, so the result is the same on every run
    when several files for the same PDB ID exist.
    Also tries different casing of the PDB ID and the `pdb<id>` naming.

    Args:
        root: The root directory to search in.
        pdb_id: The PDB ID to locate.

    Returns:
        The path to the located structure file.

    Raises:
        FileNotFoundError: If no structure file is found for the given PDB ID.
    """
    # dict.fromkeys drops duplicate stems (e.g. an already lower-case id) while keeping order.
    stems = tuple(dict.fromkeys((pdb_id, pdb_id.lower(), pdb_id.upper(), f"pdb{pdb_id.lower()}")))
    # Iterate the declared tuple instead of the set: set iteration order is not stable between runs.
    for ext in get_args(StructureFileExtensions):
        for stem in stems:
            candidate = root / f"{stem}{ext}"
            if candidate.exists():
                return candidate
    msg = f"No structure file found for {pdb_id} in {root}"
    raise FileNotFoundError(msg)
335
+
336
+
337
def glob_structure_files(input_dir: Path) -> Generator[Path]:
    """Find the structure files that sit directly inside a directory.

    One glob is run per extension in
    [StructureFileExtensions][protein_quest.io.StructureFileExtensions].
    Sub-directories are not searched.

    Args:
        input_dir: The input directory to search for structure files.

    Yields:
        Paths to the found structure files.
    """
    for suffix in valid_structure_file_extensions:
        pattern = f"*{suffix}"
        for found in input_dir.glob(pattern):
            yield found
@@ -0,0 +1,256 @@
1
+ """MCP server for protein-quest.
2
+
3
+ Can be run with:
4
+
5
+ ```shell
6
+ # for development
7
+ fastmcp dev src/protein_quest/mcp_server.py
8
+ # or from inspector
9
+ npx @modelcontextprotocol/inspector
10
+ # transport type: stdio
11
+ # command: protein-quest
12
+ # arguments: mcp
13
+
14
+ # or with server and inspector
15
+ protein-quest mcp --transport streamable-http
16
+ # in another shell
17
+ npx @modelcontextprotocol/inspector
18
+ # transport type: streamable http
19
+ # URL: http://127.0.0.1:8000/mcp
20
+
21
+ # or with copilot in VS code
22
+ # ctrl + shift + p
23
+ # mcp: add server...
24
+ # Choose STDIO
25
+ # command: uv run protein-quest mcp
26
+ # id: protein-quest
27
+ ```
28
+
29
+ Examples:
30
+
31
+ - What are the PDBe structures for `A8MT69` uniprot accession?
32
+
33
+ """
34
+
35
+ from collections.abc import Mapping
36
+ from pathlib import Path
37
+ from textwrap import dedent
38
+ from typing import Annotated
39
+
40
+ from fastmcp import FastMCP
41
+ from pydantic import Field
42
+
43
+ from protein_quest.alphafold.confidence import ConfidenceFilterQuery, ConfidenceFilterResult, filter_file_on_confidence
44
+ from protein_quest.alphafold.fetch import AlphaFoldEntry, DownloadableFormat
45
+ from protein_quest.alphafold.fetch import fetch_many as alphafold_fetch
46
+ from protein_quest.emdb import fetch as emdb_fetch
47
+ from protein_quest.go import search_gene_ontology_term
48
+ from protein_quest.io import convert_to_cif_file, glob_structure_files, read_structure
49
+ from protein_quest.pdbe.fetch import fetch as pdbe_fetch
50
+ from protein_quest.ss import filter_file_on_secondary_structure
51
+ from protein_quest.structure import (
52
+ nr_residues_in_chain,
53
+ structure2uniprot_accessions,
54
+ write_single_chain_structure_file,
55
+ )
56
+ from protein_quest.taxonomy import search_taxon
57
+ from protein_quest.uniprot import (
58
+ PdbResult,
59
+ Query,
60
+ search4af,
61
+ search4emdb,
62
+ search4macromolecular_complexes,
63
+ search4pdb,
64
+ search4uniprot,
65
+ )
66
+
67
# The FastMCP server instance; tools and prompts below are registered on it.
mcp = FastMCP("protein-quest")

# do not want to make dataclasses in non-mcp code into Pydantic models,
# so we use Annotated here to add description on roots.
71
+
72
+
73
@mcp.tool
def search_uniprot(
    uniprot_query: Annotated[Query, Field(description=Query.__doc__)],
    limit: Annotated[int, Field(gt=0, description="Limit the number of uniprot accessions returned")] = 100,
) -> set[str]:
    """Search UniProt for proteins matching the given query."""
    # Thin MCP wrapper around search4uniprot; the docstring is kept verbatim on purpose.
    accessions = search4uniprot(uniprot_query, limit=limit)
    return accessions
80
+
81
+
82
@mcp.tool
def search_pdb(
    uniprot_accs: set[str],
    limit: Annotated[int, Field(gt=0, description="Limit the number of entries returned")] = 100,
) -> Annotated[
    dict[str, set[PdbResult]],
    Field(
        # PdbResult's own docstring is embedded so the description of the
        # return value stays in sync with the dataclass.
        description=dedent(f"""\
            Dictionary with protein IDs as keys and sets of PDB results as values.
            A PDB result is {PdbResult.__doc__}""")
    ),
]:
    """Search PDBe structures for given uniprot accessions."""
    # Thin MCP wrapper: delegates to search4pdb.
    return search4pdb(uniprot_accs, limit=limit)
96
+
97
+
98
@mcp.tool
async def fetch_pdbe_structures(pdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
    """Fetch the PDBe structures for given PDB IDs.

    Args:
        pdb_ids: A set of PDB IDs.
        save_dir: The directory to save the fetched files.

    Returns:
        A mapping of PDB ID to the path of the fetched structure file.
    """
    # Thin async MCP wrapper around pdbe_fetch; the docstring is kept verbatim on purpose.
    fetched = await pdbe_fetch(pdb_ids, save_dir)
    return fetched
110
+
111
+
112
@mcp.tool
def extract_single_chain_from_structure(
    input_file: Path,
    chain2keep: str,
    output_dir: Path,
    out_chain: str = "A",
) -> Path:
    """
    Extract a single chain from a structure (mmCIF or pdb) file and write to a new file.

    Args:
        input_file: Path to the input structure (mmCIF or pdb) file.
        chain2keep: The chain to keep.
        output_dir: Directory to save the output file.
        out_chain: The chain identifier for the output file.

    Returns:
        Path to the output structure (mmCIF or pdb) file
    """
    # Thin MCP wrapper around write_single_chain_structure_file;
    # the docstring is kept verbatim on purpose.
    written = write_single_chain_structure_file(input_file, chain2keep, output_dir, out_chain)
    return written
132
+
133
+
134
@mcp.tool
def list_structure_files(path: Path) -> list[Path]:
    """List structure files (.pdb, .pdb.gz, .ent, .ent.gz, .cif, .cif.gz, .bcif, .bcif.gz) in the specified directory.

    Sub-directories are not searched.
    """
    # glob_structure_files is a generator; materialise it for the MCP response.
    return list(glob_structure_files(path))
138
+
139
+
140
# TODO replace remaining decorators with wrapper if tool does single function call
# so we do not have to replicate docstring,
# minor con is that it does not show up in api docs
# These library functions are registered directly as tools, under their own names.
mcp.tool(nr_residues_in_chain)
mcp.tool(search_taxon)
mcp.tool(search_gene_ontology_term)
146
+
147
+
148
@mcp.tool
def search_alphafolds(
    uniprot_accs: set[str],
    limit: Annotated[int, Field(gt=0, description="Limit the number of entries returned")] = 100,
) -> Annotated[
    set[str],
    Field(description="Set of uniprot accessions which have an AlphaFold entry"),
]:
    """Search for AlphaFold entries in UniProtKB accessions."""
    # A uniprot accession may map to one or more AlphaFold IDs, and an
    # AlphaFold ID equals the uniprot accession, so the answer is simply the
    # subset of uniprot_accs whose search result is non-empty.
    hits = search4af(uniprot_accs, limit)
    with_entry: set[str] = set()
    for accession, entries in hits.items():
        if entries:
            with_entry.add(accession)
    return with_entry
162
+
163
+
164
+ mcp.tool(search4emdb, name="search_emdb")
165
+ mcp.tool(search4macromolecular_complexes, name="search_macromolecular_complexes")
166
+
167
+
168
@mcp.tool
def fetch_alphafold_structures(uniprot_accs: set[str], save_dir: Path) -> list[AlphaFoldEntry]:
    """Fetch the AlphaFold mmCIF file for given UniProt accessions.

    Args:
        uniprot_accs: A set of UniProt accessions.
        save_dir: The directory to save the fetched files.

    Returns:
        A list of AlphaFold entries.
    """
    # Only the mmCIF format is requested by this tool; the docstring is kept verbatim on purpose.
    wanted_formats: set[DownloadableFormat] = {"cif"}
    entries = alphafold_fetch(uniprot_accs, save_dir, wanted_formats)
    return entries
181
+
182
+
183
@mcp.tool
async def fetch_emdb_volumes(emdb_ids: set[str], save_dir: Path) -> Mapping[str, Path]:
    """Fetch EMDB volumes for given EMDB IDs.

    Args:
        emdb_ids: A set of EMDB IDs.
        save_dir: The directory to save the fetched files.
    Returns:
        A mapping of EMDB ID to the path of the fetched volume file.
    """
    # Thin async MCP wrapper around emdb_fetch; the docstring is kept verbatim on purpose.
    volumes = await emdb_fetch(emdb_ids=emdb_ids, save_dir=save_dir)
    return volumes
194
+
195
+
196
@mcp.tool
def alphafold_confidence_filter(file: Path, query: ConfidenceFilterQuery, filtered_dir: Path) -> ConfidenceFilterResult:
    """Take a mmcif/PDB file and filter it based on confidence (plDDT) scores.

    If passes filter writes file to filtered_dir with residues above confidence threshold.
    """
    # Thin MCP wrapper around filter_file_on_confidence; the docstring is kept verbatim on purpose.
    outcome = filter_file_on_confidence(file, query, filtered_dir)
    return outcome
203
+
204
+
205
# Registered directly as tools, under the functions' own names.
mcp.tool(filter_file_on_secondary_structure)

mcp.tool(convert_to_cif_file)
208
+
209
+
210
@mcp.tool
def uniprot_accessions_of_structure_file(file: Path) -> set[str]:
    """Extract UniProt accessions from structure file."""
    # Read the file, then hand the parsed structure to the accession extractor.
    # The docstring is kept verbatim on purpose.
    return structure2uniprot_accessions(read_structure(file))
215
+
216
+
217
@mcp.prompt
def candidate_structures(
    species: str = "Human",
    cellular_location: str = "nucleus",
    confidence: int = 90,
    min_residues: int = 100,
    max_residues: int = 200,
) -> str:
    """Prompt to find candidate structures.

    Args:
        species: The species to search for (default: "Human").
        cellular_location: The cellular location to search for (default: "nucleus").
        confidence: The confidence threshold for AlphaFold structures (default: 90).
        min_residues: Minimum number of high confidence residues (default: 100).
        max_residues: Maximum number of high confidence residues (default: 200).

    Returns:
        A prompt string to find candidate structures.
    """
    # The arguments are only interpolated into the prompt text below;
    # no search or download happens in this function.
    # dedent strips the common leading whitespace of the literal.
    return dedent(f"""\
        Given the species '{species}' and cellular location '{cellular_location}' find the candidate structures.
        Download structures from 2 sources namely PDB and Alphafold.
        For alphafold I only want to use high confidence scores of over {confidence}.
        and only keep structures with number of high confidence residues between {min_residues} and {max_residues}.

        1. Search uniprot for proteins related to {species} and {cellular_location}.
        1. For the species find the NCBI taxonomy id.
        2. For cellular location find the associated GO term.
        3. Find uniprot accessions based on NCBI taxonomy id and cellular location GO term.
        2. For PDB
        1. Search for structures related to the identified proteins.
        2. Download each PDB entry from PDBe
        3. Extract chain for the protein of interest.
        3. For Alphafold
        1. Search for AlphaFold entries related to the identified proteins.
        2. Download each AlphaFold entry.
        3. Filter the structures based on {confidence} as confidence
        and nr residues between {min_residues} and {max_residues}.
        """)