protpen 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ # protpen/cli_consolidate_foldseek.py
2
+ import argparse
3
+ from protpen.foldseek_utils import consolidate_foldseek_results
4
+
5
+
6
def main():
    """
    Command-line entry point for consolidating Foldseek results.

    Parses three positional arguments (input directory, output file, query
    FASTA) plus the optional ``--top_x`` and ``--max_workers`` settings,
    hands them to ``consolidate_foldseek_results`` and writes the returned
    table as a tab-separated file. Nothing is written when the result is
    empty.
    """
    cli = argparse.ArgumentParser()
    positionals = (
        ("input_dir", "Directory with Foldseek .tsv files"),
        ("output_file", "Output consolidated .tsv file"),
        ("query_fasta", "FASTA file with query protein IDs"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    cli.add_argument(
        "--top_x", type=int, default=5, help="Top X hits per query (default: 5)"
    )
    cli.add_argument(
        "--max_workers",
        type=int,
        default=8,
        help="Number of concurrent file-reading threads",
    )
    opts = cli.parse_args()

    # NOTE(review): the result is used like a pandas DataFrame (.empty,
    # .to_csv) -- confirm against protpen.foldseek_utils.
    hits = consolidate_foldseek_results(
        opts.input_dir, opts.query_fasta, opts.top_x, max_workers=opts.max_workers
    )
    if hits.empty:
        print("No valid results found. No output written.")
        return
    hits.to_csv(opts.output_file, sep="\t", index=False)
    print(f"Saved {len(hits)} entries to {opts.output_file}")
30
+
31
+
32
+ if __name__ == "__main__":
33
+ main()
@@ -0,0 +1,33 @@
1
+ # protpen/cli_download.py
2
+
3
+ import argparse
4
+ from protpen.downloader import download_structures_from_fasta
5
+
6
+
7
def main():
    """
    Command-line entry point for downloading AlphaFold structures.

    Reads the FASTA path, the output folder and the thread count from the
    command line, calls ``download_structures_from_fasta`` and prints one
    ``<protein id>: <status>`` line per entry of the returned mapping.
    """
    cli = argparse.ArgumentParser(
        description="Download AlphaFold PDB files from UniProt FASTA."
    )
    cli.add_argument("file_in", help="Input FASTA file with UniProt IDs")
    cli.add_argument(
        "--output_folder", default="pdb_files", help="Folder to save PDB files"
    )
    cli.add_argument(
        "--max_workers",
        type=int,
        default=16,
        help="Number of concurrent download threads",
    )
    opts = cli.parse_args()

    outcomes = download_structures_from_fasta(
        opts.file_in, opts.output_folder, max_workers=opts.max_workers
    )
    for pid in outcomes:
        status = outcomes[pid]
        print(f"{pid}: {status}")
28
+
29
+
30
+ if __name__ == "__main__":
31
+ main()
32
+
33
+ # python protpen/cli_download.py input.fasta --output_folder pdbs
protpen/cli_eggnog.py ADDED
@@ -0,0 +1,42 @@
1
+ # protpen/cli_eggnog.py
2
+ import argparse
3
+ from protpen.eggnog import run_eggnog_mapper, convert_to_tsv
4
+
5
+
6
def main():
    """
    Command-line entry point for the EggNOG step.

    Parses the input FASTA, output directory, output prefix, output TSV
    path, the required ``--emapper_path`` and the optional ``--cpu`` count,
    then runs ``run_eggnog_mapper`` followed by ``convert_to_tsv``.
    """
    cli = argparse.ArgumentParser(
        description="Run EggNOG-mapper and convert output to TSV"
    )
    # (short flag, long flag, default, help) for the options that only
    # differ in those four values; registration order is preserved.
    defaulted_options = (
        ("-i", "--input_fasta", "input_proteins.fasta", "Input FASTA file"),
        ("-o", "--output_dir", "eggnog_output", "Output directory"),
        ("-p", "--output_prefix", "test_proteins", "Output prefix"),
        ("-t", "--output_tsv", "eggnog_output.tsv", "Output TSV path"),
    )
    for short_flag, long_flag, fallback, description in defaulted_options:
        cli.add_argument(short_flag, long_flag, default=fallback, help=description)
    cli.add_argument("--emapper_path", required=True, help="Path to emapper.py")
    cli.add_argument(
        "--cpu",
        type=int,
        default=None,
        help="Number of CPUs for EggNOG-mapper (diamond/mmseqs --cpu)",
    )

    opts = cli.parse_args()
    run_eggnog_mapper(
        opts.input_fasta,
        opts.output_dir,
        opts.output_prefix,
        opts.emapper_path,
        cpu=opts.cpu,
    )
    convert_to_tsv(opts.output_dir, opts.output_prefix, opts.output_tsv)
39
+
40
+
41
+ if __name__ == "__main__":
42
+ main()
protpen/cli_enrich.py ADDED
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env python3
2
+ # protpen/cli_enrich.py
3
+
4
+ # usage: python -m protpen.cli_enrich -i input.tsv -o enriched_output.tsv
5
+
6
+ import argparse
7
+ import csv
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from protpen import enrich_utils
10
+
11
+
12
def run_enrichment_pipeline(input_tsv, output_tsv, max_workers=16):
    """
    Enrich a Foldseek TSV with UniProt metadata for every PDB+chain target.

    Args:
        input_tsv: Path to a tab-separated file with a ``target`` column;
            a cell may hold several tokens separated by ``||``.
        output_tsv: Path the enriched TSV is written to.
        max_workers: Thread count used for each of the two lookup phases.

    Returns:
        None. When no (pdb_id, chain_id) pair can be parsed, an error is
        printed and nothing is written.

    A lookup that raises is reported as a warning and the affected pair
    falls back to the "n/a" placeholder record instead of aborting the run.
    """
    # Step 1: Extract all unique (pdb_id, chain_id) pairs from Foldseek target column
    pdb_chain_pairs = set()
    with open(input_tsv, newline="") as infile:
        reader = csv.DictReader(infile, delimiter="\t")
        for row in reader:
            # DictReader fills fields missing from a short row with None, so
            # a .get() default alone does not protect the .split() below.
            target_field = row.get("target") or ""
            for token in target_field.split("||"):
                if not token:
                    continue
                (pdb_id, chain_id), reason = enrich_utils.parse_pdb_chain(token)
                if pdb_id and chain_id:
                    pdb_chain_pairs.add((pdb_id, chain_id))
                else:
                    print(f"[WARN] Skipping unparsable token: {token} — {reason}")

    print(f"[INFO] Found {len(pdb_chain_pairs)} unique PDB+chain combinations.")

    if not pdb_chain_pairs:
        print("[ERROR] No valid PDB+chain pairs extracted. Exiting.")
        return

    # Step 2: Map (pdb_id, chain_id) → UniProt ID via RCSB GraphQL.
    # Each lookup is an independent network call, so run them concurrently.
    pair_to_uniprot = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_pair = {
            executor.submit(enrich_utils.get_uniprot_id_from_rcsb, pdb_id, chain_id): (
                pdb_id,
                chain_id,
            )
            for pdb_id, chain_id in sorted(pdb_chain_pairs)
        }
        for future in as_completed(future_to_pair):
            pdb_id, chain_id = future_to_pair[future]
            try:
                uniprot_id = future.result()
            except Exception as exc:
                # One failed lookup must not discard every other result.
                print(f"[WARN] Lookup failed for {pdb_id}_{chain_id}: {exc}")
                continue
            if uniprot_id:
                pair_to_uniprot[(pdb_id, chain_id)] = uniprot_id
            else:
                print(f"[WARN] No UniProt ID found for {pdb_id}_{chain_id}")

    print(f"[INFO] Retrieved UniProt IDs for {len(pair_to_uniprot)} pairs.")

    # Step 3: Retrieve UniProt metadata for each unique UniProt ID concurrently.
    unique_uniprot_ids = sorted(set(pair_to_uniprot.values()))
    uniprot_info_cache = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_id = {
            executor.submit(enrich_utils.get_uniprot_info, uniprot_id): uniprot_id
            for uniprot_id in unique_uniprot_ids
        }
        for future in as_completed(future_to_id):
            uniprot_id = future_to_id[future]
            try:
                uniprot_info_cache[uniprot_id] = future.result()
            except Exception as exc:
                print(f"[WARN] Metadata lookup failed for {uniprot_id}: {exc}")

    # Only pairs whose metadata lookup completed get a real record; the rest
    # are filled in by the fallback below.
    pair_to_info = {
        pair: uniprot_info_cache[uniprot_id]
        for pair, uniprot_id in pair_to_uniprot.items()
        if uniprot_id in uniprot_info_cache
    }

    # Step 4: Fallback for any pair that couldn't be resolved
    for pair in pdb_chain_pairs:
        if pair not in pair_to_info:
            pair_to_info[pair] = {
                "description": "n/a",
                "interpro": "n/a",
                "supfam": "n/a",
            }

    # Step 5: Enrich and write TSV
    print(f"[INFO] Writing enriched TSV to {output_tsv}")
    enrich_utils.enrich_tsv(input_tsv, output_tsv, pair_to_info=pair_to_info)
    print("[INFO] Enrichment complete.")
84
+
85
+
86
def main():
    """
    Command-line entry point for the enrichment step.

    Requires ``-i/--input`` and ``-o/--output``, accepts an optional
    ``--max_workers`` thread count, and forwards all three to
    ``run_enrichment_pipeline``.
    """
    cli = argparse.ArgumentParser(
        description="Enrich Foldseek TSVs using PDB+chain-specific UniProt metadata from RCSB GraphQL."
    )
    required_paths = (
        ("-i", "--input", "Input Foldseek TSV (with target column)"),
        ("-o", "--output", "Output TSV with UniProt enrichment"),
    )
    for short_flag, long_flag, description in required_paths:
        cli.add_argument(short_flag, long_flag, required=True, help=description)
    cli.add_argument(
        "--max_workers",
        type=int,
        default=16,
        help="Number of concurrent lookup threads",
    )
    opts = cli.parse_args()

    run_enrichment_pipeline(opts.input, opts.output, max_workers=opts.max_workers)
105
+
106
+
107
+ if __name__ == "__main__":
108
+ main()
@@ -0,0 +1,42 @@
1
+ # protpen/cli_foldseek.py
2
+ import argparse
3
+ from protpen.foldseek import run_foldseek_search
4
+
5
+
6
def main():
    """
    Command-line entry point for the Foldseek search step.

    Collects the PDB directory, output directory, temporary directory,
    database name, worker count and thread count from the command line and
    passes them to ``run_foldseek_search``.
    """
    cli = argparse.ArgumentParser(
        description="Run Foldseek search on PDB files in a directory."
    )
    cli.add_argument("pdb_dir", help="Directory containing PDB files.")
    cli.add_argument("output_dir", help="Directory to save output TSV files.")
    cli.add_argument(
        "--tmp_dir", default="tmp", help="Temporary directory for Foldseek."
    )
    cli.add_argument("--db", default="pdb", help="Database to search against.")
    concurrency_help = (
        "Number of Foldseek searches to run concurrently "
        "(only helps if individual searches don't saturate all CPUs alone)."
    )
    cli.add_argument("--max_workers", type=int, default=1, help=concurrency_help)
    cli.add_argument(
        "--threads",
        type=int,
        default=None,
        help="Passed through as Foldseek's own --threads flag.",
    )
    opts = cli.parse_args()

    tuning = {"max_workers": opts.max_workers, "threads": opts.threads}
    run_foldseek_search(
        opts.pdb_dir, opts.output_dir, opts.tmp_dir, opts.db, **tuning
    )
39
+
40
+
41
+ if __name__ == "__main__":
42
+ main()
protpen/cli_merge.py ADDED
@@ -0,0 +1,32 @@
1
+ # protpen/cli_merge.py
2
+ import argparse
3
+ import csv
4
+ from protpen.merge_utils import read_tsv, merge_data
5
+
6
+
7
def main():
    """
    Command-line entry point for merging EggNOG and Foldseek tables.

    Reads both TSVs with ``read_tsv``, combines them with ``merge_data`` and
    writes the merged rows (with a header line) to the output path as a
    tab-separated file, then reports how many rows were written.
    """
    cli = argparse.ArgumentParser(
        description="Merge EggNOG and Foldseek TSVs, filtering EggNOG by Foldseek queries."
    )
    for arg_name, arg_help in (
        ("eggnog", "Path to EggNOG TSV file"),
        ("foldseek", "Path to Foldseek TSV file"),
        ("output", "Path for merged TSV output"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    eggnog_rows, eggnog_cols = read_tsv(opts.eggnog)
    foldseek_rows, foldseek_cols = read_tsv(opts.foldseek)
    out_cols, out_rows = merge_data(
        eggnog_rows, eggnog_cols, foldseek_rows, foldseek_cols
    )

    with open(opts.output, "w", newline="") as sink:
        tsv_out = csv.DictWriter(sink, fieldnames=out_cols, delimiter="\t")
        tsv_out.writeheader()
        tsv_out.writerows(out_rows)

    print(f"Merged {len(out_rows)} rows written to {opts.output}")
29
+
30
+
31
+ if __name__ == "__main__":
32
+ main()
protpen/downloader.py ADDED
@@ -0,0 +1,201 @@
1
+ # protpen/downloader.py
2
+ import requests
3
+ import os
4
+ import json
5
+ import re
6
+ import time
7
+ import logging
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
11
+
12
+
13
+ def _request_with_retry(method, url, retries=5, backoff=1.0, **kwargs):
14
+ """
15
+ Calls requests.get/requests.head with retries on transient connection
16
+ failures (dropped connections, timeouts, 429/5xx). Now that requests
17
+ fan out across many threads instead of one at a time, occasional
18
+ remote disconnects from UniProt/AlphaFoldDB under load are expected
19
+ and shouldn't take down the whole batch.
20
+ """
21
+ func = requests.get if method == "get" else requests.head
22
+ last_exc = None
23
+ for attempt in range(retries):
24
+ try:
25
+ response = func(url, **kwargs)
26
+ except requests.exceptions.RequestException as exc:
27
+ last_exc = exc
28
+ else:
29
+ if response.status_code not in (429, 500, 502, 503, 504):
30
+ return response
31
+ last_exc = None
32
+ if attempt < retries - 1:
33
+ time.sleep(backoff * (2**attempt))
34
+ if last_exc:
35
+ raise last_exc
36
+ return response
37
+
38
+
39
def extract_protein_ids_from_fasta(file_in):
    """
    Collect the unique protein IDs named in a FASTA file's header lines.

    For a header with more than two ``|``-separated parts (e.g.
    ``>sp|P12345|NAME``) the second part is the ID; otherwise the ID is the
    run of non-whitespace characters directly after ``>``. Headers that
    yield no ID are skipped.

    Args:
        file_in: Path to the FASTA file.

    Returns:
        A list of unique IDs in order of first appearance, so repeated runs
        on the same file process proteins in the same order.
    """
    # A dict keeps insertion order while de-duplicating; a plain set would
    # hand back a different order on every run.
    ordered_ids = {}
    with open(file_in, "r") as fasta_file:
        for line in fasta_file:
            if not line.startswith(">"):
                continue
            parts = line.strip().split("|")
            if len(parts) > 2:
                protein_id = parts[1]
            else:
                match = re.match(r"^>(\S+)", line)
                protein_id = match.group(1) if match else None
            if protein_id:
                ordered_ids[protein_id] = None
    return list(ordered_ids)
53
+
54
+
55
def download_uniprot_json(uniprot_id, output_file):
    """
    Fetch the UniProtKB entry for ``uniprot_id`` and save it as JSON.

    Args:
        uniprot_id: Accession appended to the UniProt REST URL.
        output_file: Path the pretty-printed JSON is written to.

    Returns:
        True when the entry was downloaded, parsed and written; False when
        the request failed after retries, the status was not 200, or the
        body was not valid JSON. No file is created in the False cases.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}"
    try:
        response = _request_with_retry("get", url)
    except requests.exceptions.RequestException as exc:
        logging.error(f"UniProt request failed for {uniprot_id} after retries: {exc}")
        return False
    if response.status_code != 200:
        return False
    # Parse before opening the file: a body that fails to decode must not
    # leave an empty file behind, because callers treat an existing file as
    # a valid cached entry.
    try:
        payload = response.json()
    except ValueError as exc:
        logging.error(f"UniProt returned invalid JSON for {uniprot_id}: {exc}")
        return False
    with open(output_file, "w") as f:
        json.dump(payload, f, indent=2)
    return True
67
+
68
+
69
def extract_alphafold_id(data):
    """
    Return the ``id`` of the first AlphaFoldDB cross-reference in ``data``.

    ``data`` is a mapping that may carry a ``uniProtKBCrossReferences``
    list of dicts. An empty string is returned when there is no such list,
    no entry whose ``database`` is ``"AlphaFoldDB"``, or the matching entry
    has no ``id`` key.
    """
    cross_refs = data.get("uniProtKBCrossReferences", [])
    alphafold_refs = (
        entry for entry in cross_refs if entry.get("database") == "AlphaFoldDB"
    )
    first_hit = next(alphafold_refs, None)
    if first_hit is None:
        return ""
    return first_hit.get("id", "")
74
+
75
+
76
def download_alphafold_pdb(alphafold_id, output_folder, batch_size=10, max_version=100):
    """
    Finds and downloads the highest-versioned AlphaFoldDB structure for
    alphafold_id. The actual "latest" version number varies over time and
    across entries (e.g. v4 vs v6), so instead of guessing a version or
    scanning 1..100 sequentially (slow, and pathological for IDs with no
    structure at all), versions are probed in small concurrent batches and
    we stop at the first batch containing a hit.

    Args:
        alphafold_id: Identifier inserted into the AlphaFoldDB file URL.
        output_folder: Directory the ``<alphafold_id>.pdb`` file goes into;
            created if missing.
        batch_size: Number of versions probed concurrently per batch.
        max_version: Highest version number that is probed.

    Returns:
        The path of the written PDB file, or None when no version was found
        or the download did not succeed with HTTP 200.
    """
    pdb_path = os.path.join(output_folder, f"{alphafold_id}.pdb")
    base_url = f"https://alphafold.ebi.ac.uk/files/AF-{alphafold_id}-F1-model_v"

    def check_version(version):
        # HEAD is enough to learn whether this version exists.
        url = base_url + str(version) + ".pdb"
        try:
            response = _request_with_retry("head", url)
        except requests.exceptions.RequestException:
            return None
        return version if response.status_code == 200 else None

    found_version = None
    start = 1
    while start <= max_version:
        batch = range(start, min(start + batch_size, max_version + 1))
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            hits = [v for v in executor.map(check_version, batch) if v is not None]
        if hits:
            found_version = max(hits)
            break
        start += batch_size

    if found_version is None:
        logging.warning(
            f"No structure found for {alphafold_id} in versions 1-{max_version}."
        )
        return None

    url = base_url + str(found_version) + ".pdb"
    logging.info(
        f"Found structure for {alphafold_id} (v{found_version}). Downloading..."
    )
    try:
        response = _request_with_retry("get", url)
    except requests.exceptions.RequestException as exc:
        logging.error(
            f"Failed to download structure for {alphafold_id} after retries: {exc}"
        )
        return None
    # The retry helper can hand back a non-200 response (e.g. a 5xx that
    # persisted through every attempt); never save an error body as a
    # structure file.
    if response.status_code != 200:
        logging.error(
            f"Failed to download structure for {alphafold_id}: "
            f"HTTP {response.status_code}"
        )
        return None
    os.makedirs(output_folder, exist_ok=True)
    with open(pdb_path, "wb") as f:
        f.write(response.content)
    return pdb_path
128
+
129
+
130
+ def _process_protein(pid, output_folder):
131
+ try:
132
+ return _process_protein_impl(pid, output_folder)
133
+ except Exception as exc:
134
+ # A single protein's network failure shouldn't take down the whole
135
+ # batch -- log it, mark it failed, and let the rest finish.
136
+ logging.error(f"Unexpected error processing {pid}: {exc}")
137
+ return pid, "error"
138
+
139
+
140
def _process_protein_impl(pid, output_folder):
    """
    Download the UniProt JSON and the AlphaFold structure for one protein.

    The UniProt JSON is read from ``<output_folder>/<pid>.json`` and
    downloaded first if that file does not exist. The AlphaFoldDB ID found
    in it is tried first; the UniProt ID itself is tried as a fallback when
    that fails and differs from the ID already tried.

    Args:
        pid: UniProt protein ID.
        output_folder: Directory holding the JSON and PDB files.

    Returns:
        ``(pid, pdb_path)`` on success, ``(pid, "uniprot_json_failed")``
        when the JSON could not be downloaded, or ``(pid, "pdb_failed")``
        when no structure could be downloaded.
    """
    logging.info(f"Processing {pid}...")
    json_path = os.path.join(output_folder, f"{pid}.json")

    if not os.path.exists(json_path):
        logging.info(f"Downloading UniProt JSON for {pid}")
        success = download_uniprot_json(pid, json_path)
        if not success:
            logging.error(f"Failed to download UniProt JSON for {pid}")
            return pid, "uniprot_json_failed"

    with open(json_path, "r") as f:
        data = json.load(f)

    af_id = extract_alphafold_id(data)
    pdb = None

    if af_id:
        logging.info(f"AlphaFold ID for {pid} is {af_id}")
        pdb = download_alphafold_pdb(af_id, output_folder)
    else:
        logging.warning(f"No AlphaFold ID found in JSON for {pid}")

    # Retrying with the UniProt ID only makes sense when it is a different
    # identifier; otherwise it would repeat the same version scan and
    # return the same result.
    if not pdb and af_id != pid:
        logging.info(
            f"Attempting fallback: using UniProt ID {pid} to download structure"
        )
        pdb = download_alphafold_pdb(pid, output_folder)

    if pdb:
        logging.info(f"Downloaded structure for {pid}")
        return pid, pdb
    else:
        logging.error(f"Failed to download structure for {pid}")
        return pid, "pdb_failed"
175
+
176
+
177
def download_structures_from_fasta(file_in, output_folder="pdb_files", max_workers=16):
    """
    Download structures for every protein ID found in a FASTA file.

    Creates ``output_folder`` if it does not exist, extracts the IDs with
    ``extract_protein_ids_from_fasta`` and runs ``_process_protein`` for
    each one on a thread pool of ``max_workers`` threads.

    Returns:
        A dict mapping each protein ID to the status reported for it (the
        second element of the worker's result, or "error" if the worker
        raised).
    """
    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    accessions = extract_protein_ids_from_fasta(file_in)
    results = {}

    # Every protein is an independent batch of network calls (UniProt +
    # AlphaFoldDB), so they are submitted to the pool together rather than
    # handled one after another.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for accession in accessions:
            job = pool.submit(_process_protein, accession, output_folder)
            pending[job] = accession
        for finished in as_completed(pending):
            pid = pending[finished]
            try:
                pid, status = finished.result()
            except Exception as exc:
                logging.error(f"Unexpected error processing {pid}: {exc}")
                status = "error"
            results[pid] = status

    return results
protpen/eggnog.py ADDED
@@ -0,0 +1,51 @@
1
+ # protpen/eggnog.py
2
+ import os
3
+ import subprocess
4
+ import pandas as pd
5
+
6
+
7
def run_eggnog_mapper(input_fasta, output_dir, output_prefix, emapper_path, cpu=None):
    """
    Run EggNOG-mapper on ``input_fasta``.

    Args:
        input_fasta: Path passed to emapper's ``-i`` option.
        output_dir: Directory for the results; created if missing.
        output_prefix: Joined onto ``output_dir`` and passed as ``-o``.
        emapper_path: Command that launches emapper. It is split with
            shell-like quoting rules, so ``"python /opt/emapper.py"`` still
            works; a leading ``~`` in the executable is expanded.
        cpu: Optional CPU count passed as ``--cpu``; omitted when falsy.

    Raises:
        ValueError: If ``emapper_path`` contains no command.
        subprocess.CalledProcessError: If emapper exits with a non-zero
            status.

    The command is executed as an argument list without a shell, so file
    names containing spaces or shell metacharacters are passed through
    verbatim instead of being re-interpreted.
    """
    import shlex

    os.makedirs(output_dir, exist_ok=True)

    launcher = shlex.split(emapper_path, posix=os.name != "nt")
    if not launcher:
        raise ValueError("emapper_path must name the emapper executable")
    launcher[0] = os.path.expanduser(launcher[0])

    output_path = os.path.join(output_dir, output_prefix)
    # NOTE(review): "-m" is given twice, exactly as in the original command
    # line; confirm which search mode emapper actually ends up using.
    command = [
        *launcher,
        "-i",
        input_fasta,
        "-m",
        "diamond",
        "-m",
        "mmseqs",
        "-o",
        output_path,
    ]
    if cpu:
        command += ["--cpu", str(cpu)]
    subprocess.run(command, check=True, env=os.environ)
16
+
17
+
18
def convert_to_tsv(output_dir, output_prefix, output_file):
    """
    Convert an emapper annotations file into a headed TSV.

    Reads ``<output_dir>/<output_prefix>.emapper.annotations``, drops lines
    starting with ``#`` and blank lines, splits the remaining lines on tabs
    and writes them to ``output_file`` under a fixed 21-column header.

    Args:
        output_dir: Directory containing the annotations file.
        output_prefix: Prefix of the annotations file name.
        output_file: Path of the tab-separated file to write.

    When the annotations file holds no data rows, only the header line is
    written.
    """
    annotation_file = os.path.join(output_dir, f"{output_prefix}.emapper.annotations")
    data = []

    with open(annotation_file, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            record = line.strip()
            # A blank line would otherwise become a one-field row: a junk
            # record in the output, or a column-count error when it is the
            # only row.
            if not record:
                continue
            data.append(record.split("\t"))

    columns = [
        "query",
        "seed_ortholog",
        "evalue",
        "score",
        "eggNOG_OGs",
        "max_annot_lvl",
        "COG_category",
        "Description",
        "Preferred_name",
        "GOs",
        "EC",
        "KEGG_ko",
        "KEGG_Pathway",
        "KEGG_Module",
        "KEGG_Reaction",
        "KEGG_rclass",
        "BRITE",
        "KEGG_TC",
        "CAZy",
        "BiGG_Reaction",
        "PFAMs",
    ]
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_file, index=False, sep="\t")