proteinspy 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: proteinspy
3
+ Version: 1.0.0
4
+ Summary: A Poetry package to analyze protein structure from .cif files
5
+ License: MIT
6
+ Keywords: protein,bioinformatics,cif,pdb,gemmi,structure
7
+ Author: AkhilTeja2209
8
+ Author-email: your_email@example.com
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Requires-Dist: gemmi
21
+ Requires-Dist: rich
22
+ Project-URL: Homepage, https://akhilteja2209.github.io/Proteinspy/
23
+ Project-URL: Repository, https://github.com/AkhilTeja2209/Proteinspy
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Proteinspy
27
+
28
+ This is a python package based on Poetry.
29
+
30
+ This repository can be used to find the _Resolution_, _Missing residues_, _Ligands_, and _Chains_ in any Protein sample uploaded. The sample must be in a .cif file for the pipeline to run properly.
31
+
32
+ ## 📚 Data
33
+ A sample [protein](https://github.com/AkhilTeja2209/Proteinspy/blob/main/Final_proj_1/10AJ.cif) is provided in the repository; it and many other samples can also be downloaded from the [PDB](https://www.rcsb.org).
34
+
35
+ ## 🗒️ Requirements
36
+ Python, [Poetry](https://python-poetry.org/docs/), and Poetry's prerequisites must be installed on the system.
37
+
38
+ ## 🚀 To run the pipeline,
39
+
40
+ 1. For a mode based implementation
41
+ ```bash
42
+ cd "<your complete folder path>"
43
+ poetry install #installs all the dependencies of the package
44
+
45
+ poetry run proteinspy analyze 10AJ.cif #replace 10AJ.cif with the path to your own .cif file if you're using a different input
46
+ ```
47
+
48
+ 2. For an argument based implementation
49
+ ```bash
50
+ cd "<your complete folder path>"
51
+ poetry install #installs all the dependencies of the package
52
+
53
+ poetry run proteinspy resolution 10AJ.cif #gives the resolution of the protein
54
+ poetry run proteinspy ligands 10AJ.cif #gives the ligands in the protein
55
+ poetry run proteinspy missing 10AJ.cif #gives the missing residues in the protein
56
+ poetry run proteinspy chains 10AJ.cif #gives the chains in the protein
57
+ ```
58
+
59
+ ## 📈 Future Enhancements
60
+ - [ ] UI/UX
61
+ - [ ] Containerisation, using docker
62
+
63
+ ## ⚠️ NOTE
64
+ This is an open-source project, and any contributions, even if not mentioned in the future enhancements section, are welcome.
65
+
@@ -0,0 +1,39 @@
1
+ # Proteinspy
2
+
3
+ This is a python package based on Poetry.
4
+
5
+ This repository can be used to find the _Resolution_, _Missing residues_, _Ligands_, and _Chains_ in any Protein sample uploaded. The sample must be in a .cif file for the pipeline to run properly.
6
+
7
+ ## 📚 Data
8
+ A sample [protein](https://github.com/AkhilTeja2209/Proteinspy/blob/main/Final_proj_1/10AJ.cif) is provided in the repository; it and many other samples can also be downloaded from the [PDB](https://www.rcsb.org).
9
+
10
+ ## 🗒️ Requirements
11
+ Python, [Poetry](https://python-poetry.org/docs/), and Poetry's prerequisites must be installed on the system.
12
+
13
+ ## 🚀 To run the pipeline,
14
+
15
+ 1. For a mode based implementation
16
+ ```bash
17
+ cd "<your complete folder path>"
18
+ poetry install #installs all the dependencies of the package
19
+
20
+ poetry run proteinspy analyze 10AJ.cif #replace 10AJ.cif with the path to your own .cif file if you're using a different input
21
+ ```
22
+
23
+ 2. For an argument based implementation
24
+ ```bash
25
+ cd "<your complete folder path>"
26
+ poetry install #installs all the dependencies of the package
27
+
28
+ poetry run proteinspy resolution 10AJ.cif #gives the resolution of the protein
29
+ poetry run proteinspy ligands 10AJ.cif #gives the ligands in the protein
30
+ poetry run proteinspy missing 10AJ.cif #gives the missing residues in the protein
31
+ poetry run proteinspy chains 10AJ.cif #gives the chains in the protein
32
+ ```
33
+
34
+ ## 📈 Future Enhancements
35
+ - [ ] UI/UX
36
+ - [ ] Containerisation, using docker
37
+
38
+ ## ⚠️ NOTE
39
+ This is an open-source project, and any contributions, even if not mentioned in the future enhancements section, are welcome.
@@ -0,0 +1,3 @@
1
+ from .analysis import get_resolution, get_chains, get_ligands, get_missing_residues
2
+
3
# Keep in sync with the `version` field in pyproject.toml (released as 1.0.0).
__version__ = "1.0.0"
@@ -0,0 +1,108 @@
1
+ import click
2
+ from rich.console import Console
3
+ from rich.table import Table
4
+ from rich import box
5
+
6
+ from .analysis import get_resolution, get_chains, get_ligands, get_missing_residues
7
+
8
+ console = Console()
9
+
10
+
11
+ # ── display helpers ──
12
+
13
def show_resolution(path: str) -> None:
    """Print the resolution and experimental method reported for *path*."""
    info = get_resolution(path)
    console.print("\n[bold cyan]Resolution[/bold cyan]")
    # get_resolution() reports None when no usable value was found.
    resolution_line = (
        f" Resolution : [bold]{info['resolution']} {info['unit']}[/bold]"
        if info["resolution"]
        else " Resolution : [yellow]Not available[/yellow]"
    )
    console.print(resolution_line)
    console.print(f" Method : {info['method']}\n")
21
+
22
def show_chains(path: str) -> None:
    """Print a table with one row per chain returned by get_chains()."""
    report = get_chains(path)
    console.print(f"[bold cyan]Chains[/bold cyan] ({report['chain_count']} total)\n")

    table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
    # (header, add_column keyword options); "Type" keeps Rich's default alignment.
    column_specs = (
        ("Chain ID", {"justify": "center"}),
        ("Type", {}),
        ("Residues", {"justify": "right"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for entry in report["chains"]:
        table.add_row(entry["id"], entry["type"], str(entry["residue_count"]))

    console.print(table)
    console.print()
33
+
34
def show_ligands(path: str) -> None:
    """Print the ligands returned by get_ligands(), or a notice if none exist."""
    report = get_ligands(path)
    console.print(f"[bold cyan]Ligands[/bold cyan] ({report['ligand_count']} found)\n")

    if report["has_ligand"]:
        table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
        for header, alignment in (
            ("Ligand ID", "center"),
            ("Chain", "center"),
            ("Seq Num", "right"),
        ):
            table.add_column(header, justify=alignment)
        for ligand in report["ligands"]:
            # Entries may lack chain / seq_num, so fall back to "?".
            table.add_row(
                ligand["id"],
                ligand.get("chain", "?"),
                ligand.get("seq_num", "?"),
            )
        console.print(table)
        console.print()
    else:
        console.print(" [yellow]No ligands detected.[/yellow]\n")
48
+
49
def show_missing(path: str) -> None:
    """Print the missing residues returned by get_missing_residues()."""
    report = get_missing_residues(path)
    console.print(
        f"[bold cyan]Missing Residues[/bold cyan] ({report['missing_count']} found)\n"
    )

    if report["missing_count"] != 0:
        table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
        for header, alignment in (
            ("Chain", "center"),
            ("Residue", "center"),
            ("Seq #", "right"),
        ):
            table.add_column(header, justify=alignment)
        for entry in report["missing_residues"]:
            chain_id = entry.get("chain", "?")
            residue_name = entry.get("residue", "?")
            # seq_num may be an int or a str depending on the source, so normalise it.
            seq_label = str(entry.get("seq_num", "?"))
            table.add_row(chain_id, residue_name, seq_label)
        console.print(table)
        console.print()
    else:
        console.print(" [green]No missing residues.[/green]\n")
67
+
68
+
69
+ #CLI commands
70
+
71
@click.group()
def main():
    """proteinspy — Analyse a .cif protein structure file."""
    # Intentionally empty: this callback only anchors the sub-commands that are
    # registered on it with @main.command(...). The docstring above is the
    # group's help text, so it is left untouched.
74
+
75
@main.command("analyze")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_analyze(cif_file: str):
    """Run all analyses on a .cif file (mode-based)."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error instead of letting the parser fail with a traceback later on.
    # Imported locally so this command does not depend on a module-level import.
    from rich.markup import escape

    # Escape the filename: square brackets in a path would otherwise be parsed
    # as Rich console markup.
    console.rule(f"[bold blue]proteinspy — {escape(cif_file)}[/bold blue]")
    show_resolution(cif_file)
    show_chains(cif_file)
    show_ligands(cif_file)
    show_missing(cif_file)
    console.rule()
85
+
86
@main.command("resolution")
# click.Path turns a missing path or a directory into a normal usage error
# instead of a traceback from the structure parser.
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_resolution(cif_file: str):
    """Report resolution only."""
    show_resolution(cif_file)
91
+
92
@main.command("chains")
# click.Path turns a missing path or a directory into a normal usage error
# instead of a traceback from the structure parser.
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_chains(cif_file: str):
    """Report chains only."""
    show_chains(cif_file)
97
+
98
@main.command("ligands")
# click.Path turns a missing path or a directory into a normal usage error
# instead of a traceback from the structure parser.
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_ligands(cif_file: str):
    """Report ligands only."""
    show_ligands(cif_file)
103
+
104
@main.command("missing")
# click.Path turns a missing path or a directory into a normal usage error
# instead of a traceback from the structure parser.
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_missing(cif_file: str):
    """Report missing residues only."""
    show_missing(cif_file)
@@ -0,0 +1,146 @@
1
+ import gemmi
2
+
3
# Component IDs that ligand detection skips.

# Amino-acid residue names, including SEC, PYL and the UNK placeholder.
_STANDARD_AA = {
    "ALA", "ARG", "ASN", "ASP", "CYS",
    "GLN", "GLU", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO",
    "SER", "THR", "TRP", "TYR", "VAL",
    "SEC", "PYL", "UNK",
}

# DNA (D-prefixed) and RNA nucleotide residue names.
_STANDARD_NUC = {
    "DA", "DC", "DG", "DT", "DI",
    "A", "C", "G", "U", "I",
}

# Water plus other small components that are not reported as ligands.
# NOTE(review): despite the name, this set appears to also hold ions, buffers
# and additives (e.g. SO4, PO4, GOL); confirm the intended membership.
_SOLVENT = {
    "HOH", "WAT", "DOD",
    "SO4", "EDO", "GOL", "PEG", "ACT", "MPD",
    "PO4", "CLR", "DMS", "FMT", "TRS", "IOD", "BME", "EPE",
}
14
+
15
+
16
def get_resolution(path: str) -> dict:
    """Return the resolution and experimental method of a structure file.

    The resolution reported by gemmi is used when it is positive. Otherwise
    a few well-known CIF tags are tried in order and the first one holding a
    positive number wins.

    Args:
        path: Path to the structure (.cif) file.

    Returns:
        A dict with keys ``resolution`` (float or None), ``unit`` ("Å" when a
        resolution was found, otherwise None) and ``method`` (the
        ``_exptl.method`` value, or "unknown").

    Raises:
        Whatever ``gemmi.read_structure`` raises for an unreadable file.
    """
    st = gemmi.read_structure(path)

    res = st.resolution if st.resolution and st.resolution > 0 else None

    # Parse the raw CIF once; the resolution fallback and the method lookup
    # both use it. This stays best-effort: any parse failure means "no block".
    try:
        block = gemmi.cif.read(path).sole_block()
    except Exception:
        block = None

    # Fallback: read directly from CIF tags
    if res is None and block is not None:
        for tag in ("_refine.ls_d_res_high",
                    "_reflns.d_resolution_high",
                    "_em_3d_reconstruction.resolution"):
            val = block.find_value(tag)
            if not val or val in {"?", "."}:
                continue
            try:
                candidate = float(val.strip().strip("'\""))
            except ValueError:
                # An unparsable value in one tag must not stop us from
                # trying the remaining tags.
                continue
            # Same rule as for gemmi's own value: only positive numbers count
            # (this also filters out NaN).
            if candidate > 0:
                res = candidate
                break

    # Get experimental method
    method = "unknown"
    if block is not None:
        m = block.find_value("_exptl.method")
        if m and m not in {"?", "."}:
            # Raw CIF values may still carry their quoting.
            method = m.strip().strip("'\"")

    return {"resolution": res, "unit": "Å" if res else None, "method": method}
47
+
48
+
49
def get_chains(path: str) -> dict:
    """Summarise every uniquely named chain in the structure.

    Returns a dict with ``chain_count`` and ``chains``; each chain entry has
    ``id``, ``type`` and ``residue_count``. A chain name is reported once,
    using its first occurrence across the models.
    """
    structure = gemmi.read_structure(path)
    # Keyed by chain name; insertion order preserves first-seen order.
    summaries = {}

    for model in structure:
        for chain in model:
            if chain.name in summaries:
                continue
            polymer = chain.get_polymer()
            if len(polymer) > 0:
                kind = str(polymer.check_polymer_type())
            else:
                kind = "non-polymer"
            summaries[chain.name] = {
                "id": chain.name,
                "type": kind,
                "residue_count": sum(1 for _ in chain),
            }

    chain_list = list(summaries.values())
    return {"chain_count": len(chain_list), "chains": chain_list}
66
+
67
+
68
def get_ligands(path: str) -> dict:
    """Collect the ligands of a structure.

    Residues whose entity type is non-polymer or unknown are reported unless
    their name is in one of the exclusion sets. If that finds nothing, the
    ``_pdbx_entity_nonpoly`` CIF table is consulted instead (best-effort).

    Returns a dict with ``ligand_count``, ``has_ligand`` and ``ligands``.
    """
    structure = gemmi.read_structure(path)
    ligands = []
    seen = set()

    for model in structure:
        for chain in model:
            for residue in chain:
                is_candidate = residue.entity_type in (
                    gemmi.EntityType.NonPolymer,
                    gemmi.EntityType.Unknown,
                )
                if not is_candidate:
                    continue
                comp_name = residue.name.strip()
                if (comp_name in _STANDARD_AA
                        or comp_name in _STANDARD_NUC
                        or comp_name in _SOLVENT):
                    continue
                # One entry per (name, chain, seqid), even across models.
                key = f"{comp_name}:{chain.name}:{residue.seqid}"
                if key not in seen:
                    seen.add(key)
                    ligands.append({
                        "id": comp_name,
                        "chain": chain.name,
                        "seq_num": str(residue.seqid),
                    })

    # Fallback: read _pdbx_entity_nonpoly from CIF
    if len(ligands) == 0:
        try:
            block = gemmi.cif.read(path).sole_block()
            for row in block.find("_pdbx_entity_nonpoly.", ["name", "comp_id"]):
                comp_id = row[1].strip().strip("'\"")
                if comp_id in _SOLVENT or comp_id in _STANDARD_AA:
                    continue
                if comp_id in seen:
                    continue
                seen.add(comp_id)
                description = row[0].strip().strip("'\"")
                # The entity table carries no chain or sequence position.
                ligands.append({
                    "id": comp_id,
                    "chain": "?",
                    "seq_num": "?",
                    "name": description,
                })
        except Exception:
            pass

    total = len(ligands)
    return {"ligand_count": total, "has_ligand": total > 0, "ligands": ligands}
106
+
107
+
108
def get_missing_residues(path: str) -> dict:
    """Return residues present in the sequence but missing from ATOM records.

    The ``_pdbx_unobs_or_zero_occ_residues`` CIF table is preferred when it
    yields polymer rows. Otherwise each chain's observed residues are
    compared with the full sequence of its entity.

    Args:
        path: Path to the structure (.cif) file.

    Returns:
        A dict with ``missing_count`` and ``missing_residues``. Entries from
        the CIF table have ``chain``, ``residue``, ``seq_num`` and ``model``
        (all strings). Entries from the sequence comparison have ``chain``,
        ``seq_num`` (1-based int) and ``residue``.

    Raises:
        Whatever ``gemmi.read_structure`` raises for an unreadable file.
    """
    # Parsed first so an unreadable file fails exactly as before.
    st = gemmi.read_structure(path)

    # Method B: read _pdbx_unobs_or_zero_occ_residues from CIF (more reliable)
    cif_missing = []
    try:
        block = gemmi.cif.read(path).sole_block()
        table = block.find("_pdbx_unobs_or_zero_occ_residues.",
                           ["auth_asym_id", "auth_comp_id", "auth_seq_id",
                            "PDB_model_num", "polymer_flag"])
        for row in table:
            # Only polymer residues count as missing residues.
            if row[4].strip() != "Y":
                continue
            cif_missing.append({"chain": row[0].strip(), "residue": row[1].strip(),
                                "seq_num": row[2].strip(), "model": row[3].strip()})
    except Exception:
        # Best-effort: fall through to the sequence comparison below.
        pass

    if cif_missing:
        # The CIF table wins, so the sequence comparison is not needed.
        return {"missing_count": len(cif_missing), "missing_residues": cif_missing}

    # Method A: compare full entity sequence vs observed ATOM residues
    missing = []
    # These entries carry no model number, so the same gap seen in several
    # models is reported once rather than once per model.
    reported = set()
    for model in st:
        for chain in model:
            polymer = chain.get_polymer()
            if len(polymer) == 0:
                continue
            entity_id = next((r.entity_id for r in polymer), None)
            if entity_id is None:
                continue
            entity = st.get_entity(entity_id)
            if entity is None:
                continue
            observed = {str(r.label_seq) for r in polymer if r.label_seq is not None}
            for idx, mon in enumerate(entity.full_sequence, start=1):
                if str(idx) in observed:
                    continue
                key = (chain.name, idx)
                if key in reported:
                    continue
                reported.add(key)
                missing.append({"chain": chain.name, "seq_num": idx, "residue": mon})

    return {"missing_count": len(missing), "missing_residues": missing}
@@ -0,0 +1,34 @@
1
+ [tool.poetry]
2
+ name = "proteinspy"
3
+ version = "1.0.0"
4
+ description = "A Poetry package to analyze protein structure from .cif files"
5
+ authors = ["AkhilTeja2209 <your_email@example.com>"]
6
+ readme = "README.md"
7
+ license = "MIT"
8
+ homepage = "https://akhilteja2209.github.io/Proteinspy/"
9
+ repository = "https://github.com/AkhilTeja2209/Proteinspy"
10
+ keywords = ["protein", "bioinformatics", "cif", "pdb", "gemmi", "structure"]
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
16
+ ]
17
+
18
+ packages = [{ include = "proteinspy" }]
19
+
20
[tool.poetry.dependencies]
python = "^3.9"
# click is imported by the proteinspy.cli entry point and must be declared,
# otherwise the `proteinspy` console script fails on a clean install.
click = "*"
gemmi = "*"
rich = "*"
24
+
25
+ [tool.poetry.group.dev.dependencies]
26
+ mkdocs = "^1.6"
27
+ mkdocs-material = "^9.5"
28
+
29
+ [tool.poetry.scripts]
30
+ proteinspy = "proteinspy.cli:main"
31
+
32
+ [build-system]
33
+ requires = ["poetry-core"]
34
+ build-backend = "poetry.core.masonry.api"