proteinspy 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proteinspy
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A Poetry package to analyze protein structure from .cif files
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: protein,bioinformatics,cif,pdb,gemmi,structure
|
|
7
|
+
Author: AkhilTeja2209
|
|
8
|
+
Author-email: your_email@example.com
|
|
9
|
+
Requires-Python: >=3.9,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Dist: gemmi
|
|
21
|
+
Requires-Dist: rich
|
|
22
|
+
Project-URL: Homepage, https://akhilteja2209.github.io/Proteinspy/
|
|
23
|
+
Project-URL: Repository, https://github.com/AkhilTeja2209/Proteinspy
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# Proteinspy
|
|
27
|
+
|
|
28
|
+
This is a python package based on Poetry.
|
|
29
|
+
|
|
30
|
+
This repository can be used to find the _Resolution_, _Missing residues_, _Ligands_, and _Chains_ in any Protein sample uploaded. The sample must be in a .cif file for the pipeline to run properly.
|
|
31
|
+
|
|
32
|
+
## 📚 Data
|
|
33
|
+
A sample [protein](https://github.com/AkhilTeja2209/Proteinspy/blob/main/Final_proj_1/10AJ.cif) is provided in the repository; it, along with many other samples, can also be downloaded from the [PDB](https://www.rcsb.org).
|
|
34
|
+
|
|
35
|
+
## 🗒️ Requirements
|
|
36
|
+
[Poetry](https://python-poetry.org/docs/) for Python and its prerequisites must be installed on the system.
|
|
37
|
+
|
|
38
|
+
## 🚀 To run the pipeline,
|
|
39
|
+
|
|
40
|
+
1. For a mode based implementation
|
|
41
|
+
```bash
|
|
42
|
+
cd "<your complete folder path>"
|
|
43
|
+
poetry install #installs all the dependencies of the package
|
|
44
|
+
|
|
45
|
+
poetry run proteinspy analyze 10AJ.cif # replace 10AJ.cif with the name or path of your own .cif file
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
2. For an argument based implementation
|
|
49
|
+
```bash
|
|
50
|
+
cd "<your complete folder path>"
|
|
51
|
+
poetry install #installs all the dependencies of the package
|
|
52
|
+
|
|
53
|
+
poetry run proteinspy resolution 10AJ.cif #gives the resolution of the protein
|
|
54
|
+
poetry run proteinspy ligands 10AJ.cif #gives the ligands in the protein
|
|
55
|
+
poetry run proteinspy missing 10AJ.cif #gives the missing residues in the protein
|
|
56
|
+
poetry run proteinspy chains 10AJ.cif #gives the chains in the protein
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## 📈 Future Enhancements
|
|
60
|
+
- [ ] UI/UX
|
|
61
|
+
- [ ] Containerisation, using docker
|
|
62
|
+
|
|
63
|
+
## ⚠️ NOTE
|
|
64
|
+
This is an open-source project, and any contributions, even if not mentioned in the future enhancements section, are welcome.
|
|
65
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Proteinspy
|
|
2
|
+
|
|
3
|
+
This is a python package based on Poetry.
|
|
4
|
+
|
|
5
|
+
This repository can be used to find the _Resolution_, _Missing residues_, _Ligands_, and _Chains_ in any Protein sample uploaded. The sample must be in a .cif file for the pipeline to run properly.
|
|
6
|
+
|
|
7
|
+
## 📚 Data
|
|
8
|
+
A sample [protein](https://github.com/AkhilTeja2209/Proteinspy/blob/main/Final_proj_1/10AJ.cif) is provided in the repository; it, along with many other samples, can also be downloaded from the [PDB](https://www.rcsb.org).
|
|
9
|
+
|
|
10
|
+
## 🗒️ Requirements
|
|
11
|
+
[Poetry](https://python-poetry.org/docs/) for Python and its prerequisites must be installed on the system.
|
|
12
|
+
|
|
13
|
+
## 🚀 To run the pipeline,
|
|
14
|
+
|
|
15
|
+
1. For a mode based implementation
|
|
16
|
+
```bash
|
|
17
|
+
cd "<your complete folder path>"
|
|
18
|
+
poetry install #installs all the dependencies of the package
|
|
19
|
+
|
|
20
|
+
poetry run proteinspy analyze 10AJ.cif # replace 10AJ.cif with the name or path of your own .cif file
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
2. For an argument based implementation
|
|
24
|
+
```bash
|
|
25
|
+
cd "<your complete folder path>"
|
|
26
|
+
poetry install #installs all the dependencies of the package
|
|
27
|
+
|
|
28
|
+
poetry run proteinspy resolution 10AJ.cif #gives the resolution of the protein
|
|
29
|
+
poetry run proteinspy ligands 10AJ.cif #gives the ligands in the protein
|
|
30
|
+
poetry run proteinspy missing 10AJ.cif #gives the missing residues in the protein
|
|
31
|
+
poetry run proteinspy chains 10AJ.cif #gives the chains in the protein
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## 📈 Future Enhancements
|
|
35
|
+
- [ ] UI/UX
|
|
36
|
+
- [ ] Containerisation, using docker
|
|
37
|
+
|
|
38
|
+
## ⚠️ NOTE
|
|
39
|
+
This is an open-source project, and any contributions, even if not mentioned in the future enhancements section, are welcome.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import click
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich import box
|
|
5
|
+
|
|
6
|
+
from .analysis import get_resolution, get_chains, get_ligands, get_missing_residues
|
|
7
|
+
|
|
8
|
+
console = Console()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ── display helpers ──
|
|
12
|
+
|
|
13
|
+
def show_resolution(path: str) -> None:
    """Print the resolution and experimental method reported for *path*.

    The values come from ``get_resolution``; a falsy resolution (``None`` or
    ``0``) is shown as "Not available".
    """
    info = get_resolution(path)
    console.print("\n[bold cyan]Resolution[/bold cyan]")
    resolution_line = (
        f" Resolution : [bold]{info['resolution']} {info['unit']}[/bold]"
        if info["resolution"]
        else " Resolution : [yellow]Not available[/yellow]"
    )
    console.print(resolution_line)
    console.print(f" Method : {info['method']}\n")
|
|
21
|
+
|
|
22
|
+
def show_chains(path: str) -> None:
    """Print a table with one row per chain returned by ``get_chains``."""
    report = get_chains(path)
    console.print(f"[bold cyan]Chains[/bold cyan] ({report['chain_count']} total)\n")

    table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
    # (header, extra add_column keyword arguments) for each column, in order.
    columns = (
        ("Chain ID", {"justify": "center"}),
        ("Type", {}),
        ("Residues", {"justify": "right"}),
    )
    for header, options in columns:
        table.add_column(header, **options)

    for entry in report["chains"]:
        # The residue count is an int; table cells must be strings.
        table.add_row(entry["id"], entry["type"], str(entry["residue_count"]))

    console.print(table)
    console.print()
|
|
33
|
+
|
|
34
|
+
def show_ligands(path: str) -> None:
    """Print a table of the ligands returned by ``get_ligands``.

    When no ligand was found, a short notice is printed instead of the table.
    """
    report = get_ligands(path)
    console.print(f"[bold cyan]Ligands[/bold cyan] ({report['ligand_count']} found)\n")

    if not report["has_ligand"]:
        console.print(" [yellow]No ligands detected.[/yellow]\n")
        return

    table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
    for header, alignment in (
        ("Ligand ID", "center"),
        ("Chain", "center"),
        ("Seq Num", "right"),
    ):
        table.add_column(header, justify=alignment)

    for ligand in report["ligands"]:
        # Chain and sequence number fall back to "?" when the entry lacks them.
        chain_id = ligand.get("chain", "?")
        seq_num = ligand.get("seq_num", "?")
        table.add_row(ligand["id"], chain_id, seq_num)

    console.print(table)
    console.print()
|
|
48
|
+
|
|
49
|
+
def show_missing(path: str) -> None:
    """Print a table of the missing residues returned by ``get_missing_residues``.

    When the count is zero, a short notice is printed instead of the table.
    """
    report = get_missing_residues(path)
    console.print(f"[bold cyan]Missing Residues[/bold cyan] ({report['missing_count']} found)\n")

    if report["missing_count"] == 0:
        console.print(" [green]No missing residues.[/green]\n")
        return

    table = Table(box=box.SIMPLE_HEAVY, show_header=True, header_style="bold magenta")
    for header, alignment in (
        ("Chain", "center"),
        ("Residue", "center"),
        ("Seq #", "right"),
    ):
        table.add_column(header, justify=alignment)

    for entry in report["missing_residues"]:
        chain_id = entry.get("chain", "?")
        residue_name = entry.get("residue", "?")
        # seq_num may be an int or a string depending on how it was detected.
        seq_num = str(entry.get("seq_num", "?"))
        table.add_row(chain_id, residue_name, seq_num)

    console.print(table)
    console.print()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
#CLI commands
|
|
70
|
+
|
|
71
|
+
# Root click command group. The sub-commands below attach themselves to it
# with @main.command(...). The docstring doubles as the group's --help text,
# so it is kept short and user-facing.
@click.group()
def main():
    """proteinspy — Analyse a .cif protein structure file."""
|
|
74
|
+
|
|
75
|
+
@main.command("analyze")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_analyze(cif_file: str):
    """Run all analyses on a .cif file (mode-based)."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error, instead of letting the structure reader fail with a traceback.
    # Local import keeps this block self-contained; rich is already a
    # dependency of this module.
    from rich.markup import escape

    # The path is user input: escape it so square brackets in a file name are
    # printed literally rather than parsed as rich markup tags.
    console.rule(f"[bold blue]proteinspy — {escape(cif_file)}[/bold blue]")
    show_resolution(cif_file)
    show_chains(cif_file)
    show_ligands(cif_file)
    show_missing(cif_file)
    console.rule()
|
|
85
|
+
|
|
86
|
+
@main.command("resolution")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_resolution(cif_file: str):
    """Report resolution only."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error before any analysis runs.
    show_resolution(cif_file)
|
|
91
|
+
|
|
92
|
+
@main.command("chains")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_chains(cif_file: str):
    """Report chains only."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error before any analysis runs.
    show_chains(cif_file)
|
|
97
|
+
|
|
98
|
+
@main.command("ligands")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_ligands(cif_file: str):
    """Report ligands only."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error before any analysis runs.
    show_ligands(cif_file)
|
|
103
|
+
|
|
104
|
+
@main.command("missing")
@click.argument("cif_file", type=click.Path(exists=True, dir_okay=False))
def cmd_missing(cif_file: str):
    """Report missing residues only."""
    # click.Path rejects a missing path or a directory with a normal usage
    # error before any analysis runs.
    show_missing(cif_file)
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import gemmi
|
|
2
|
+
|
|
3
|
+
# Residue types to exclude from ligand detection
|
|
4
|
+
_STANDARD_AA = {
|
|
5
|
+
"ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS","ILE",
|
|
6
|
+
"LEU","LYS","MET","PHE","PRO","SER","THR","TRP","TYR","VAL",
|
|
7
|
+
"SEC","PYL","UNK",
|
|
8
|
+
}
|
|
9
|
+
_STANDARD_NUC = {"DA","DC","DG","DT","DI","A","C","G","U","I"}
|
|
10
|
+
_SOLVENT = {
|
|
11
|
+
"HOH","WAT","DOD","SO4","EDO","GOL","PEG","ACT","MPD",
|
|
12
|
+
"PO4","CLR","DMS","FMT","TRS","IOD","BME","EPE"
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_resolution(path: str) -> dict:
    """Return the resolution of the structure in Angstroms.

    The value stored on the parsed structure is preferred. When it is missing
    or not positive, a few resolution tags are read directly from the CIF
    block as a fallback. The experimental method is read from ``_exptl.method``.

    Args:
        path: Path of the structure file to read.

    Returns:
        A dict with keys ``"resolution"`` (float or ``None``), ``"unit"``
        (``"Å"`` when a non-zero resolution was found, else ``None``) and
        ``"method"`` (the method string, or ``"unknown"``).

    Raises:
        Whatever ``gemmi.read_structure`` raises when the file cannot be read.
    """
    st = gemmi.read_structure(path)

    res = st.resolution if st.resolution and st.resolution > 0 else None

    # Parse the raw CIF once; both the resolution fallback and the method
    # lookup read from the same block. This stays best-effort: if the raw
    # document cannot be read as a single block, both lookups are skipped.
    try:
        block = gemmi.cif.read(path).sole_block()
    except Exception:
        block = None

    # Fallback: read directly from CIF tags, first usable value wins.
    if res is None and block is not None:
        for tag in ("_refine.ls_d_res_high",
                    "_reflns.d_resolution_high",
                    "_em_3d_reconstruction.resolution"):
            val = block.find_value(tag)
            if not val or val in {"?", "."}:
                continue
            try:
                res = float(val)
            except ValueError:
                # A malformed value under one tag must not stop the remaining
                # tags from being tried.
                continue
            break

    # Get experimental method
    method = "unknown"
    if block is not None:
        m = block.find_value("_exptl.method")
        if m and m not in {"?", "."}:
            # Raw CIF values keep their quotes; drop them for display.
            method = m.strip().strip("'\"")

    return {"resolution": res, "unit": "Å" if res else None, "method": method}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_chains(path: str) -> dict:
    """Summarise the chains of the structure stored at *path*.

    Chains are visited model by model, and a chain name that has already been
    recorded is skipped, so each name is reported once.

    Returns:
        A dict with ``"chain_count"`` and ``"chains"``; each chain entry holds
        ``"id"``, ``"type"`` and ``"residue_count"``.
    """
    structure = gemmi.read_structure(path)
    # Keyed by chain name; dict insertion order preserves first-seen order.
    by_name = {}

    for model in structure:
        for chain in model:
            if chain.name in by_name:
                continue
            polymer = chain.get_polymer()
            if len(polymer) > 0:
                kind = str(polymer.check_polymer_type())
            else:
                kind = "non-polymer"
            by_name[chain.name] = {
                "id": chain.name,
                "type": kind,
                "residue_count": sum(1 for _ in chain),
            }

    chains = list(by_name.values())
    return {"chain_count": len(chains), "chains": chains}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_ligands(path: str) -> dict:
    """Collect the ligands of the structure stored at *path*.

    Residues whose entity type is non-polymer or unknown are candidates; names
    listed in the module's amino-acid, nucleotide or solvent sets are skipped.
    If nothing is found this way, the ``_pdbx_entity_nonpoly`` CIF category is
    consulted instead (best-effort; errors there are ignored).

    Returns:
        A dict with ``"ligand_count"``, ``"has_ligand"`` and ``"ligands"``.
    """
    structure = gemmi.read_structure(path)
    candidate_types = (gemmi.EntityType.NonPolymer, gemmi.EntityType.Unknown)
    ligands = []
    seen = set()

    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.entity_type not in candidate_types:
                    continue
                comp_id = residue.name.strip()
                if comp_id in _STANDARD_AA or comp_id in _STANDARD_NUC or comp_id in _SOLVENT:
                    continue
                # One entry per (name, chain, seqid), even across models.
                key = f"{comp_id}:{chain.name}:{residue.seqid}"
                if key not in seen:
                    seen.add(key)
                    ligands.append({
                        "id": comp_id,
                        "chain": chain.name,
                        "seq_num": str(residue.seqid),
                    })

    # Fallback: read _pdbx_entity_nonpoly from CIF
    if not ligands:
        try:
            block = gemmi.cif.read(path).sole_block()
            for row in block.find("_pdbx_entity_nonpoly.", ["name", "comp_id"]):
                # Raw CIF values may still carry their quotes.
                comp_id = row[1].strip().strip("'\"")
                if comp_id in _SOLVENT or comp_id in _STANDARD_AA or comp_id in seen:
                    continue
                seen.add(comp_id)
                ligands.append({
                    "id": comp_id,
                    "chain": "?",
                    "seq_num": "?",
                    "name": row[0].strip().strip("'\""),
                })
        except Exception:
            pass

    return {
        "ligand_count": len(ligands),
        "has_ligand": len(ligands) > 0,
        "ligands": ligands,
    }
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_missing_residues(path: str) -> dict:
    """Return residues present in the sequence but missing from ATOM records.

    Two sources are used:

    * Method A compares each polymer's entity sequence with the ``label_seq``
      numbers of the residues actually present.
    * Method B reads the ``_pdbx_unobs_or_zero_occ_residues`` CIF category and
      keeps the rows flagged as polymer.

    Method B's result is returned whenever it is non-empty; otherwise Method
    A's result is used.

    Args:
        path: Path of the structure file to read.

    Returns:
        A dict with ``"missing_count"`` and ``"missing_residues"``. Method A
        entries hold ``"chain"``, ``"seq_num"`` (int) and ``"residue"``;
        Method B entries hold ``"chain"``, ``"residue"``, ``"seq_num"`` (str)
        and ``"model"``.
    """
    st = gemmi.read_structure(path)
    missing = []
    # (chain name, sequence position) pairs already reported. Without this, a
    # multi-model structure would repeat every missing residue once per model.
    reported = set()

    # Method A: compare full entity sequence vs observed ATOM residues
    for model in st:
        for chain in model:
            polymer = chain.get_polymer()
            if len(polymer) == 0:
                continue
            observed = {r.label_seq for r in polymer if r.label_seq is not None}
            entity_id = next((r.entity_id for r in polymer), None)
            if entity_id is None:
                continue
            entity = st.get_entity(entity_id)
            if entity is None:
                continue
            # Positions in the entity sequence are numbered from 1.
            for idx, mon in enumerate(entity.full_sequence, start=1):
                key = (chain.name, idx)
                if idx in observed or key in reported:
                    continue
                reported.add(key)
                missing.append({"chain": chain.name, "seq_num": idx, "residue": mon})

    # Method B: read _pdbx_unobs_or_zero_occ_residues from CIF (more reliable)
    cif_missing = []
    try:
        block = gemmi.cif.read(path).sole_block()
        table = block.find("_pdbx_unobs_or_zero_occ_residues.",
                           ["auth_asym_id", "auth_comp_id", "auth_seq_id",
                            "PDB_model_num", "polymer_flag"])
        for row in table:
            # Keep polymer residues only.
            if row[4].strip() != "Y":
                continue
            cif_missing.append({"chain": row[0].strip(), "residue": row[1].strip(),
                                "seq_num": row[2].strip(), "model": row[3].strip()})
    except Exception:
        # Best-effort: when the raw CIF cannot be read, fall back to Method A.
        pass

    final = cif_missing if cif_missing else missing
    return {"missing_count": len(final), "missing_residues": final}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "proteinspy"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A Poetry package to analyze protein structure from .cif files"
|
|
5
|
+
authors = ["AkhilTeja2209 <your_email@example.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
homepage = "https://akhilteja2209.github.io/Proteinspy/"
|
|
9
|
+
repository = "https://github.com/AkhilTeja2209/Proteinspy"
|
|
10
|
+
keywords = ["protein", "bioinformatics", "cif", "pdb", "gemmi", "structure"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
packages = [{ include = "proteinspy" }]
|
|
19
|
+
|
|
20
|
+
[tool.poetry.dependencies]
python = "^3.9"
# click is imported by the CLI module behind the `proteinspy` console script;
# without it declared here, a clean install fails at import time.
click = "*"
gemmi = "*"
rich = "*"
|
|
24
|
+
|
|
25
|
+
[tool.poetry.group.dev.dependencies]
|
|
26
|
+
mkdocs = "^1.6"
|
|
27
|
+
mkdocs-material = "^9.5"
|
|
28
|
+
|
|
29
|
+
[tool.poetry.scripts]
|
|
30
|
+
proteinspy = "proteinspy.cli:main"
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["poetry-core"]
|
|
34
|
+
build-backend = "poetry.core.masonry.api"
|