PyPI - protpen - Versions diffs - 1.0.0__py3-none-any.whl - Mend

protpen 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

protpen/cli_consolidate_foldseek.py +33 -0
protpen/cli_download.py +33 -0
protpen/cli_eggnog.py +42 -0
protpen/cli_enrich.py +108 -0
protpen/cli_foldseek.py +42 -0
protpen/cli_merge.py +32 -0
protpen/downloader.py +201 -0
protpen/eggnog.py +51 -0
protpen/enrich_utils.py +259 -0
protpen/foldseek.py +90 -0
protpen/foldseek_utils.py +65 -0
protpen/merge_utils.py +85 -0
protpen-1.0.0.dist-info/METADATA +863 -0
protpen-1.0.0.dist-info/RECORD +16 -0
protpen-1.0.0.dist-info/WHEEL +4 -0
protpen-1.0.0.dist-info/licenses/LICENSE +674 -0

protpen/cli_consolidate_foldseek.py ADDED Viewed

@@ -0,0 +1,33 @@
+# protpen/cli_consolidate_foldseek.py
+import argparse
+from protpen.foldseek_utils import consolidate_foldseek_results
+def main():
+    """Command-line entry point: consolidate Foldseek TSV hits into one file.
+
+    Parses the input directory, output path, query FASTA and tuning options,
+    forwards them to consolidate_foldseek_results, and writes the returned
+    object as a tab-separated file unless it reports itself as empty.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("input_dir", help="Directory with Foldseek .tsv files")
+    cli.add_argument("output_file", help="Output consolidated .tsv file")
+    cli.add_argument("query_fasta", help="FASTA file with query protein IDs")
+    cli.add_argument(
+        "--top_x", type=int, default=5, help="Top X hits per query (default: 5)"
+    )
+    cli.add_argument(
+        "--max_workers",
+        type=int,
+        default=8,
+        help="Number of concurrent file-reading threads",
+    )
+    opts = cli.parse_args()
+    consolidated = consolidate_foldseek_results(
+        opts.input_dir, opts.query_fasta, opts.top_x, max_workers=opts.max_workers
+    )
+    # Guard clause: nothing is written when the result is empty.
+    if consolidated.empty:
+        print("No valid results found. No output written.")
+        return
+    consolidated.to_csv(opts.output_file, sep="\t", index=False)
+    print(f"Saved {len(consolidated)} entries to {opts.output_file}")
+if __name__ == "__main__":
+    main()

protpen/cli_download.py ADDED Viewed

@@ -0,0 +1,33 @@
+# protpen/cli_download.py
+import argparse
+from protpen.downloader import download_structures_from_fasta
+def main():
+    """Command-line entry point for fetching structures listed in a FASTA file.
+
+    Reads the FASTA path, output folder and thread count from the command
+    line, calls download_structures_from_fasta, and prints one
+    "<protein id>: <status>" line per entry of the returned mapping.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download AlphaFold PDB files from UniProt FASTA."
+    )
+    cli.add_argument("file_in", help="Input FASTA file with UniProt IDs")
+    cli.add_argument(
+        "--output_folder", default="pdb_files", help="Folder to save PDB files"
+    )
+    cli.add_argument(
+        "--max_workers",
+        type=int,
+        default=16,
+        help="Number of concurrent download threads",
+    )
+    opts = cli.parse_args()
+    statuses = download_structures_from_fasta(
+        opts.file_in, opts.output_folder, max_workers=opts.max_workers
+    )
+    for protein_id in statuses:
+        print(f"{protein_id}: {statuses[protein_id]}")
+if __name__ == "__main__":
+    main()
+# python protpen/cli_download.py input.fasta --output_folder pdbs

protpen/cli_eggnog.py ADDED Viewed

@@ -0,0 +1,42 @@
+# protpen/cli_eggnog.py
+import argparse
+from protpen.eggnog import run_eggnog_mapper, convert_to_tsv
+def main():
+    """Command-line entry point: run EggNOG-mapper, then convert its output.
+
+    Parses the options, calls run_eggnog_mapper with the input FASTA, output
+    directory, prefix, emapper path and CPU count, and afterwards calls
+    convert_to_tsv to produce the file named by --output_tsv.
+    """
+    cli = argparse.ArgumentParser(
+        description="Run EggNOG-mapper and convert output to TSV"
+    )
+    # Options that share the same shape: short flag, long flag, default, help.
+    defaulted_options = (
+        ("-i", "--input_fasta", "input_proteins.fasta", "Input FASTA file"),
+        ("-o", "--output_dir", "eggnog_output", "Output directory"),
+        ("-p", "--output_prefix", "test_proteins", "Output prefix"),
+        ("-t", "--output_tsv", "eggnog_output.tsv", "Output TSV path"),
+    )
+    for short_flag, long_flag, default_value, help_text in defaulted_options:
+        cli.add_argument(short_flag, long_flag, default=default_value, help=help_text)
+    cli.add_argument("--emapper_path", required=True, help="Path to emapper.py")
+    cli.add_argument(
+        "--cpu",
+        type=int,
+        default=None,
+        help="Number of CPUs for EggNOG-mapper (diamond/mmseqs --cpu)",
+    )
+    opts = cli.parse_args()
+    run_eggnog_mapper(
+        opts.input_fasta,
+        opts.output_dir,
+        opts.output_prefix,
+        opts.emapper_path,
+        cpu=opts.cpu,
+    )
+    convert_to_tsv(opts.output_dir, opts.output_prefix, opts.output_tsv)
+if __name__ == "__main__":
+    main()

protpen/cli_enrich.py ADDED Viewed

@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# protpen/cli_enrich.py
+# usage: python -m protpen.cli_enrich -i input.tsv -o enriched_output.tsv
+import argparse
+import csv
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from protpen import enrich_utils
+def run_enrichment_pipeline(input_tsv, output_tsv, max_workers=16):
+    """Annotate a Foldseek TSV with UniProt metadata for each PDB chain hit.
+
+    Steps: collect the unique (pdb_id, chain_id) pairs named in the "target"
+    column, resolve each pair to a UniProt ID, fetch metadata once per
+    distinct UniProt ID, give unresolved pairs "n/a" placeholders, and hand
+    the mapping to enrich_utils.enrich_tsv to write output_tsv.
+
+    Args:
+        input_tsv: Path of the tab-separated input; its "target" column holds
+            "||"-separated tokens.
+        output_tsv: Path passed to enrich_utils.enrich_tsv as the output.
+        max_workers: Thread count used for each of the two lookup pools.
+
+    Returns:
+        None. Returns early, after printing an error, if no pair is parsed.
+    """
+    # Step 1: gather every distinct (pdb_id, chain_id) from the target column.
+    unique_pairs = set()
+    with open(input_tsv, newline="") as handle:
+        for record in csv.DictReader(handle, delimiter="\t"):
+            for token in record.get("target", "").split("||"):
+                if not token:
+                    continue
+                (pdb_id, chain_id), reason = enrich_utils.parse_pdb_chain(token)
+                if not (pdb_id and chain_id):
+                    print(f"[WARN] Skipping unparsable token: {token} — {reason}")
+                    continue
+                unique_pairs.add((pdb_id, chain_id))
+    print(f"[INFO] Found {len(unique_pairs)} unique PDB+chain combinations.")
+    if not unique_pairs:
+        print("[ERROR] No valid PDB+chain pairs extracted. Exiting.")
+        return
+    # Step 2: resolve each pair to a UniProt ID. Lookups are submitted to a
+    # thread pool and consumed in completion order.
+    pair_to_uniprot = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        pending_pairs = {}
+        for pair in sorted(unique_pairs):
+            job = pool.submit(enrich_utils.get_uniprot_id_from_rcsb, pair[0], pair[1])
+            pending_pairs[job] = pair
+        for job in as_completed(pending_pairs):
+            pdb_id, chain_id = pending_pairs[job]
+            resolved = job.result()
+            if not resolved:
+                print(f"[WARN] No UniProt ID found for {pdb_id}_{chain_id}")
+                continue
+            pair_to_uniprot[(pdb_id, chain_id)] = resolved
+    print(f"[INFO] Retrieved UniProt IDs for {len(pair_to_uniprot)} pairs.")
+    # Step 3: fetch metadata once per distinct UniProt ID, again via a pool.
+    info_by_uniprot = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        pending_ids = {}
+        for accession in sorted(set(pair_to_uniprot.values())):
+            pending_ids[pool.submit(enrich_utils.get_uniprot_info, accession)] = accession
+        for job in as_completed(pending_ids):
+            info_by_uniprot[pending_ids[job]] = job.result()
+    pair_to_info = {}
+    for pair, accession in pair_to_uniprot.items():
+        pair_to_info[pair] = info_by_uniprot[accession]
+    # Step 4: placeholder metadata for every pair that was not resolved.
+    for pair in unique_pairs:
+        pair_to_info.setdefault(
+            pair,
+            {
+                "description": "n/a",
+                "interpro": "n/a",
+                "supfam": "n/a",
+            },
+        )
+    # Step 5: write the enriched TSV.
+    print(f"[INFO] Writing enriched TSV to {output_tsv}")
+    enrich_utils.enrich_tsv(input_tsv, output_tsv, pair_to_info=pair_to_info)
+    print("[INFO] Enrichment complete.")
+def main():
+    """Command-line entry point for the enrichment pipeline.
+
+    Parses --input, --output and --max_workers and passes them straight to
+    run_enrichment_pipeline.
+    """
+    cli = argparse.ArgumentParser(
+        description="Enrich Foldseek TSVs using PDB+chain-specific UniProt metadata from RCSB GraphQL."
+    )
+    cli.add_argument(
+        "-i", "--input", required=True, help="Input Foldseek TSV (with target column)"
+    )
+    cli.add_argument(
+        "-o", "--output", required=True, help="Output TSV with UniProt enrichment"
+    )
+    cli.add_argument(
+        "--max_workers",
+        type=int,
+        default=16,
+        help="Number of concurrent lookup threads",
+    )
+    opts = cli.parse_args()
+    run_enrichment_pipeline(opts.input, opts.output, max_workers=opts.max_workers)
+if __name__ == "__main__":
+    main()

protpen/cli_foldseek.py ADDED Viewed

@@ -0,0 +1,42 @@
+# protpen/cli_foldseek.py
+import argparse
+from protpen.foldseek import run_foldseek_search
+def main():
+    """Command-line entry point for running Foldseek over a PDB directory.
+
+    Parses the directories, database name and concurrency options, then
+    forwards them to run_foldseek_search.
+    """
+    cli = argparse.ArgumentParser(
+        description="Run Foldseek search on PDB files in a directory."
+    )
+    cli.add_argument("pdb_dir", help="Directory containing PDB files.")
+    cli.add_argument("output_dir", help="Directory to save output TSV files.")
+    cli.add_argument(
+        "--tmp_dir", default="tmp", help="Temporary directory for Foldseek."
+    )
+    cli.add_argument("--db", default="pdb", help="Database to search against.")
+    cli.add_argument(
+        "--max_workers",
+        type=int,
+        default=1,
+        help="Number of Foldseek searches to run concurrently "
+        "(only helps if individual searches don't saturate all CPUs alone).",
+    )
+    cli.add_argument(
+        "--threads",
+        type=int,
+        default=None,
+        help="Passed through as Foldseek's own --threads flag.",
+    )
+    opts = cli.parse_args()
+    run_foldseek_search(
+        opts.pdb_dir,
+        opts.output_dir,
+        opts.tmp_dir,
+        opts.db,
+        max_workers=opts.max_workers,
+        threads=opts.threads,
+    )
+if __name__ == "__main__":
+    main()

protpen/cli_merge.py ADDED Viewed

@@ -0,0 +1,32 @@
+# protpen/cli_merge.py
+import argparse
+import csv
+from protpen.merge_utils import read_tsv, merge_data
+def main():
+    """Command-line entry point: merge an EggNOG TSV with a Foldseek TSV.
+
+    Loads both inputs with read_tsv, combines them with merge_data, writes
+    the merged rows as a tab-separated file with a header line, and prints
+    how many rows were written.
+    """
+    cli = argparse.ArgumentParser(
+        description="Merge EggNOG and Foldseek TSVs, filtering EggNOG by Foldseek queries."
+    )
+    cli.add_argument("eggnog", help="Path to EggNOG TSV file")
+    cli.add_argument("foldseek", help="Path to Foldseek TSV file")
+    cli.add_argument("output", help="Path for merged TSV output")
+    opts = cli.parse_args()
+    egg_rows, egg_columns = read_tsv(opts.eggnog)
+    fold_rows, fold_columns = read_tsv(opts.foldseek)
+    out_columns, out_rows = merge_data(egg_rows, egg_columns, fold_rows, fold_columns)
+    with open(opts.output, "w", newline="") as sink:
+        tsv = csv.DictWriter(sink, fieldnames=out_columns, delimiter="\t")
+        tsv.writeheader()
+        tsv.writerows(out_rows)
+    print(f"Merged {len(out_rows)} rows written to {opts.output}")
+if __name__ == "__main__":
+    main()

protpen/downloader.py ADDED Viewed

@@ -0,0 +1,201 @@
+# protpen/downloader.py
+import requests
+import os
+import json
+import re
+import time
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
+def _request_with_retry(method, url, retries=5, backoff=1.0, **kwargs):
+    """
+    Issue a GET or HEAD request, retrying transient failures with
+    exponential backoff.
+
+    A failure is transient when requests raises a RequestException or the
+    server answers 429/500/502/503/504. Requests fan out across many
+    threads, so occasional remote disconnects under load are expected and
+    shouldn't take down the whole batch.
+
+    Args:
+        method: "get" selects requests.get; any other value selects
+            requests.head.
+        url: Address to request.
+        retries: Maximum number of attempts. Values below 1 are treated as a
+            single attempt.
+        backoff: Base delay in seconds; the wait after attempt k (0-based)
+            is backoff * 2**k. No wait follows the final attempt.
+        **kwargs: Forwarded to the requests call. A 30-second timeout is
+            applied unless the caller supplies its own timeout.
+
+    Returns:
+        The first response whose status is not transient, or the last
+        transient-status response if every attempt produced one.
+
+    Raises:
+        requests.exceptions.RequestException: If the final attempt raised.
+    """
+    func = requests.get if method == "get" else requests.head
+    # requests does not time out unless told to; without a bound a single
+    # stalled connection would block its worker thread indefinitely.
+    kwargs.setdefault("timeout", 30)
+    # Always make at least one attempt so there is a response or an
+    # exception to report even when retries is 0 or negative.
+    attempts = max(1, retries)
+    response = None
+    last_exc = None
+    for attempt in range(attempts):
+        try:
+            response = func(url, **kwargs)
+        except requests.exceptions.RequestException as exc:
+            last_exc = exc
+        else:
+            if response.status_code not in (429, 500, 502, 503, 504):
+                return response
+            # A transient status supersedes any earlier exception.
+            last_exc = None
+        if attempt < attempts - 1:
+            time.sleep(backoff * (2**attempt))
+    if last_exc:
+        raise last_exc
+    return response
+def extract_protein_ids_from_fasta(file_in):
+    """Return the distinct protein IDs named in the headers of a FASTA file.
+
+    A header with more than two "|"-separated fields contributes its second
+    field; any other header contributes the first run of non-whitespace
+    characters after ">". Headers that yield nothing are skipped. The result
+    is a list built from a set, so its order is not defined.
+    """
+    collected = set()
+    with open(file_in, "r") as handle:
+        for header in (text for text in handle if text.startswith(">")):
+            fields = header.strip().split("|")
+            if len(fields) > 2:
+                accession = fields[1]
+            else:
+                found = re.match(r"^>(\S+)", header)
+                accession = found.group(1) if found else None
+            if not accession:
+                continue
+            collected.add(accession)
+    return list(collected)
+def download_uniprot_json(uniprot_id, output_file):
+    """Fetch a UniProtKB entry and save it as indented JSON.
+
+    Returns True when the request answers 200 and the file is written, and
+    False when the request fails after retries or answers any other status.
+    """
+    endpoint = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}"
+    try:
+        reply = _request_with_retry("get", endpoint)
+    except requests.exceptions.RequestException as exc:
+        logging.error(f"UniProt request failed for {uniprot_id} after retries: {exc}")
+        return False
+    if reply.status_code != 200:
+        return False
+    with open(output_file, "w") as sink:
+        json.dump(reply.json(), sink, indent=2)
+    return True
+def extract_alphafold_id(data):
+    """Return the ID of the first AlphaFoldDB cross-reference in *data*.
+
+    Looks through data["uniProtKBCrossReferences"] (treated as empty when
+    absent). Returns "" when there is no AlphaFoldDB entry or when the first
+    such entry has no "id" key.
+    """
+    alphafold_ids = (
+        entry.get("id", "")
+        for entry in data.get("uniProtKBCrossReferences", [])
+        if entry.get("database") == "AlphaFoldDB"
+    )
+    return next(alphafold_ids, "")
+def download_alphafold_pdb(alphafold_id, output_folder, batch_size=10, max_version=100):
+    """
+    Finds and downloads an AlphaFoldDB structure for alphafold_id.
+
+    The actual "latest" version number varies over time and across entries
+    (e.g. v4 vs v6), so instead of guessing a version or scanning 1..100
+    sequentially (slow, and pathological for IDs with no structure at all),
+    versions are probed with HEAD requests in small concurrent batches. The
+    scan stops at the first batch containing a hit and downloads the highest
+    version found in that batch; later batches are not probed.
+
+    Args:
+        alphafold_id: Identifier placed in the AF-<id>-F1-model_v<N>.pdb URL.
+        output_folder: Directory for <alphafold_id>.pdb; created if missing.
+        batch_size: Number of versions probed concurrently per batch.
+        max_version: Highest version number that will be probed.
+
+    Returns:
+        Path of the written PDB file, or None when no version answered 200,
+        the download request failed, or the download answered a non-200
+        status.
+    """
+    pdb_path = os.path.join(output_folder, f"{alphafold_id}.pdb")
+    base_url = f"https://alphafold.ebi.ac.uk/files/AF-{alphafold_id}-F1-model_v"
+    def check_version(version):
+        # Returns the version number when its file answers 200, else None.
+        url = base_url + str(version) + ".pdb"
+        try:
+            response = _request_with_retry("head", url)
+        except requests.exceptions.RequestException:
+            return None
+        return version if response.status_code == 200 else None
+    found_version = None
+    start = 1
+    while start <= max_version:
+        batch = range(start, min(start + batch_size, max_version + 1))
+        with ThreadPoolExecutor(max_workers=batch_size) as executor:
+            hits = [v for v in executor.map(check_version, batch) if v is not None]
+        if hits:
+            found_version = max(hits)
+            break
+        start += batch_size
+    if found_version is None:
+        logging.warning(
+            f"No structure found for {alphafold_id} in versions 1-{max_version}."
+        )
+        return None
+    url = base_url + str(found_version) + ".pdb"
+    logging.info(
+        f"Found structure for {alphafold_id} (v{found_version}). Downloading..."
+    )
+    try:
+        response = _request_with_retry("get", url)
+    except requests.exceptions.RequestException as exc:
+        logging.error(
+            f"Failed to download structure for {alphafold_id} after retries: {exc}"
+        )
+        return None
+    # The HEAD probe succeeding does not guarantee the GET does. Refuse to
+    # save an error body as if it were a structure file.
+    if response.status_code != 200:
+        logging.error(
+            f"Failed to download structure for {alphafold_id}: "
+            f"HTTP {response.status_code}"
+        )
+        return None
+    os.makedirs(output_folder, exist_ok=True)
+    with open(pdb_path, "wb") as f:
+        f.write(response.content)
+    return pdb_path
+def _process_protein(pid, output_folder):
+    """Run _process_protein_impl for one protein, converting any exception
+    into a (pid, "error") result so one failure cannot abort the batch."""
+    try:
+        outcome = _process_protein_impl(pid, output_folder)
+    except Exception as exc:
+        # Log the failure, mark this protein as failed, and let the other
+        # proteins finish.
+        logging.error(f"Unexpected error processing {pid}: {exc}")
+        outcome = (pid, "error")
+    return outcome
+def _process_protein_impl(pid, output_folder):
+    """Download the UniProt JSON and a structure file for one protein.
+
+    An existing <pid>.json in output_folder is reused instead of being
+    downloaded again. The structure is requested under the AlphaFoldDB ID
+    from the JSON when there is one, and under pid itself as a fallback.
+
+    Returns:
+        (pid, path) on success, (pid, "uniprot_json_failed") when the JSON
+        download fails, or (pid, "pdb_failed") when no structure is saved.
+    """
+    logging.info(f"Processing {pid}...")
+    json_path = os.path.join(output_folder, f"{pid}.json")
+    if not os.path.exists(json_path):
+        logging.info(f"Downloading UniProt JSON for {pid}")
+        if not download_uniprot_json(pid, json_path):
+            logging.error(f"Failed to download UniProt JSON for {pid}")
+            return pid, "uniprot_json_failed"
+    with open(json_path, "r") as source:
+        entry = json.load(source)
+    af_id = extract_alphafold_id(entry)
+    if af_id:
+        logging.info(f"AlphaFold ID for {pid} is {af_id}")
+        structure_path = download_alphafold_pdb(af_id, output_folder)
+    else:
+        logging.warning(f"No AlphaFold ID found in JSON for {pid}")
+        structure_path = None
+    if not structure_path:
+        logging.info(
+            f"Attempting fallback: using UniProt ID {pid} to download structure"
+        )
+        structure_path = download_alphafold_pdb(pid, output_folder)
+    if not structure_path:
+        logging.error(f"Failed to download structure for {pid}")
+        return pid, "pdb_failed"
+    logging.info(f"Downloaded structure for {pid}")
+    return pid, structure_path
+def download_structures_from_fasta(file_in, output_folder="pdb_files", max_workers=16):
+    """Download a structure for every protein ID found in a FASTA file.
+
+    Creates output_folder if it does not exist, extracts the protein IDs,
+    and processes them on a thread pool of max_workers threads.
+
+    Returns:
+        Dict mapping each protein ID to the status reported by
+        _process_protein, or to "error" if retrieving the result raised.
+    """
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    outcomes = {}
+    # Each protein is an independent unit of work, so all of them are
+    # submitted up front and collected as they finish.
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        jobs = {}
+        for protein_id in extract_protein_ids_from_fasta(file_in):
+            jobs[pool.submit(_process_protein, protein_id, output_folder)] = protein_id
+        for finished in as_completed(jobs):
+            pid = jobs[finished]
+            try:
+                pid, status = finished.result()
+            except Exception as exc:
+                logging.error(f"Unexpected error processing {pid}: {exc}")
+                status = "error"
+            outcomes[pid] = status
+    return outcomes

protpen/eggnog.py ADDED Viewed

@@ -0,0 +1,51 @@
+# protpen/eggnog.py
+import os
+import subprocess
+import pandas as pd
+def run_eggnog_mapper(input_fasta, output_dir, output_prefix, emapper_path, cpu=None):
+    """Run EggNOG-mapper on a FASTA file.
+
+    The command is passed to subprocess as an argument list rather than a
+    shell string, so paths containing spaces or shell metacharacters reach
+    emapper unchanged and are never interpreted by a shell. As a
+    consequence, shell expansions such as "~" or "$VAR" inside the
+    arguments are no longer performed.
+
+    Args:
+        input_fasta: Path given to emapper's -i option.
+        output_dir: Directory for the results; created if missing.
+        output_prefix: File prefix; joined with output_dir for -o.
+        emapper_path: Command that launches emapper. It is split with
+            shlex.split, so a multi-word value such as
+            "python /opt/emapper.py" still works; quote any path that
+            contains spaces.
+        cpu: Optional CPU count appended as --cpu when truthy.
+
+    Raises:
+        subprocess.CalledProcessError: If emapper exits with a non-zero code.
+    """
+    import shlex  # local import: only this function needs it
+
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, output_prefix)
+    command = [
+        *shlex.split(emapper_path),
+        "-i",
+        input_fasta,
+        "-m",
+        "diamond",
+        "-m",
+        "mmseqs",
+        "-o",
+        output_path,
+    ]
+    if cpu:
+        command += ["--cpu", str(cpu)]
+    subprocess.run(command, check=True, env=os.environ)
+def convert_to_tsv(output_dir, output_prefix, output_file):
+    """Convert an EggNOG-mapper annotations file into a clean TSV.
+
+    Reads <output_dir>/<output_prefix>.emapper.annotations, drops every line
+    starting with "#" and every blank line, and writes the remaining rows to
+    output_file under the fixed 21-column header below.
+
+    Only the line terminator is removed before splitting, so empty leading
+    or trailing fields keep their position instead of shifting columns.
+
+    Args:
+        output_dir: Directory holding the annotations file.
+        output_prefix: Prefix of the annotations file name.
+        output_file: Destination path for the tab-separated output.
+    """
+    annotation_file = os.path.join(output_dir, f"{output_prefix}.emapper.annotations")
+    data = []
+    with open(annotation_file, "r") as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            row = line.rstrip("\r\n")
+            # A blank line would otherwise become a row of empty cells.
+            if not row.strip():
+                continue
+            data.append(row.split("\t"))
+    columns = [
+        "query",
+        "seed_ortholog",
+        "evalue",
+        "score",
+        "eggNOG_OGs",
+        "max_annot_lvl",
+        "COG_category",
+        "Description",
+        "Preferred_name",
+        "GOs",
+        "EC",
+        "KEGG_ko",
+        "KEGG_Pathway",
+        "KEGG_Module",
+        "KEGG_Reaction",
+        "KEGG_rclass",
+        "BRITE",
+        "KEGG_TC",
+        "CAZy",
+        "BiGG_Reaction",
+        "PFAMs",
+    ]
+    df = pd.DataFrame(data, columns=columns)
+    df.to_csv(output_file, index=False, sep="\t")