protqc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protqc-0.1.0/LICENSE +21 -0
- protqc-0.1.0/PKG-INFO +247 -0
- protqc-0.1.0/README.md +205 -0
- protqc-0.1.0/protqc/__init__.py +23 -0
- protqc-0.1.0/protqc/chat.py +842 -0
- protqc-0.1.0/protqc/cli.py +908 -0
- protqc-0.1.0/protqc/config.py +100 -0
- protqc-0.1.0/protqc/pipeline.py +244 -0
- protqc-0.1.0/protqc/progress.py +127 -0
- protqc-0.1.0/protqc/report.py +674 -0
- protqc-0.1.0/protqc/scoring.py +206 -0
- protqc-0.1.0/protqc/tools/__init__.py +44 -0
- protqc-0.1.0/protqc/tools/_suppress.py +31 -0
- protqc-0.1.0/protqc/tools/fpocket_tool.py +206 -0
- protqc-0.1.0/protqc/tools/freesasa_tool.py +116 -0
- protqc-0.1.0/protqc/tools/hbond_tool.py +129 -0
- protqc-0.1.0/protqc/tools/openmm_tool.py +356 -0
- protqc-0.1.0/protqc/tools/ss_stability_tool.py +125 -0
- protqc-0.1.0/protqc/tools/structure_scorer.py +170 -0
- protqc-0.1.0/protqc/types.py +108 -0
- protqc-0.1.0/protqc.egg-info/PKG-INFO +247 -0
- protqc-0.1.0/protqc.egg-info/SOURCES.txt +40 -0
- protqc-0.1.0/protqc.egg-info/dependency_links.txt +1 -0
- protqc-0.1.0/protqc.egg-info/entry_points.txt +2 -0
- protqc-0.1.0/protqc.egg-info/requires.txt +28 -0
- protqc-0.1.0/protqc.egg-info/top_level.txt +1 -0
- protqc-0.1.0/pyproject.toml +63 -0
- protqc-0.1.0/setup.cfg +4 -0
- protqc-0.1.0/tests/test_chat.py +605 -0
- protqc-0.1.0/tests/test_cli.py +274 -0
- protqc-0.1.0/tests/test_config.py +108 -0
- protqc-0.1.0/tests/test_fpocket_tool.py +300 -0
- protqc-0.1.0/tests/test_freesasa_tool.py +102 -0
- protqc-0.1.0/tests/test_hbond_tool.py +54 -0
- protqc-0.1.0/tests/test_openmm_tool.py +107 -0
- protqc-0.1.0/tests/test_pipeline.py +125 -0
- protqc-0.1.0/tests/test_progress.py +231 -0
- protqc-0.1.0/tests/test_report.py +285 -0
- protqc-0.1.0/tests/test_scoring.py +275 -0
- protqc-0.1.0/tests/test_ss_stability_tool.py +58 -0
- protqc-0.1.0/tests/test_structure_scorer.py +191 -0
- protqc-0.1.0/tests/test_types.py +64 -0
protqc-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Omur Koray Guzel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
protqc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: protqc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Physics-based verification of AI-designed protein structures
|
|
5
|
+
Author: Ömür Koray Güzel
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: protein,design,verification,physics,molecular-dynamics,AI
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Dist: numpy>=1.26.0
|
|
18
|
+
Requires-Dist: rich>=13.7.0
|
|
19
|
+
Requires-Dist: jinja2>=3.1.0
|
|
20
|
+
Provides-Extra: chat
|
|
21
|
+
Requires-Dist: litellm>=1.40.0; extra == "chat"
|
|
22
|
+
Provides-Extra: all
|
|
23
|
+
Requires-Dist: litellm>=1.40.0; extra == "all"
|
|
24
|
+
Requires-Dist: fair-esm>=2.0.0; extra == "all"
|
|
25
|
+
Requires-Dist: openmm>=8.1.0; extra == "all"
|
|
26
|
+
Requires-Dist: pdbfixer>=1.9; extra == "all"
|
|
27
|
+
Requires-Dist: mdtraj>=1.10.0; extra == "all"
|
|
28
|
+
Requires-Dist: MDAnalysis>=2.7.0; extra == "all"
|
|
29
|
+
Requires-Dist: freesasa>=2.2.0; extra == "all"
|
|
30
|
+
Requires-Dist: biopython>=1.84; extra == "all"
|
|
31
|
+
Requires-Dist: pandas>=2.2.0; extra == "all"
|
|
32
|
+
Requires-Dist: matplotlib>=3.9.0; extra == "all"
|
|
33
|
+
Requires-Dist: seaborn>=0.13.0; extra == "all"
|
|
34
|
+
Requires-Dist: scipy>=1.13.0; extra == "all"
|
|
35
|
+
Requires-Dist: scikit-learn>=1.5.0; extra == "all"
|
|
36
|
+
Requires-Dist: tqdm>=4.66.0; extra == "all"
|
|
37
|
+
Requires-Dist: requests>=2.32.0; extra == "all"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# ProtQC
|
|
44
|
+
|
|
45
|
+
**Physics-based verification of AI-designed protein structures**
|
|
46
|
+
|
|
47
|
+
[License: MIT](LICENSE)
|
|
48
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
49
|
+
|
|
50
|
+
*Catches structural hallucinations before wet-lab*
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Why ProtQC?
|
|
55
|
+
|
|
56
|
+
AI protein design tools (AlphaFold, RFdiffusion, ProteinMPNN, BoltzGen) routinely produce structures with high confidence scores (pLDDT > 90) that still fail experimentally. A protein can look perfect by pLDDT yet harbor internal voids, unstable hydrogen bond networks, or thermodynamic instabilities that only surface in solution.
|
|
57
|
+
|
|
58
|
+
ProtQC combines six physics-based metrics into a composite risk score, catching high-pLDDT hallucinations that no single metric detects on its own.
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
protqc analyze protein.pdb
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## The 6 Metrics
|
|
67
|
+
|
|
68
|
+
| # | Metric | Source | What It Catches |
|
|
69
|
+
|---|--------|--------|-----------------|
|
|
70
|
+
| 1 | **pLDDT** | Structure prediction | Low confidence regions |
|
|
71
|
+
| 2 | **MD RMSD** | OpenMM | Backbone instability under simulation |
|
|
72
|
+
| 3 | **Cavity Volume** | fpocket | Internal voids and packing defects |
|
|
73
|
+
| 4 | **H-bond Persistence** | MDTraj | Weak hydrogen bond networks |
|
|
74
|
+
| 5 | **SS Preservation** | MDTraj DSSP | Secondary structure loss during MD |
|
|
75
|
+
| 6 | **SASA Polar Ratio** | FreeSASA | Abnormal surface accessibility |
|
|
76
|
+
|
|
77
|
+
Each metric produces a normalized 0–1 sub-score. The composite risk score is a weighted sum, mapped to a verdict:
|
|
78
|
+
|
|
79
|
+
- **PASS** (risk < 0.30) — Design is physically plausible
|
|
80
|
+
- **WARNING** (0.30 ≤ risk < 0.50) — Proceed with caution; review flagged metrics
|
|
81
|
+
- **FAIL** (risk ≥ 0.50) — Design has significant structural issues
|
|
82
|
+
|
|
83
|
+
### Risk Scoring Weights
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
risk_weights:
|
|
87
|
+
plddt: 0.12
|
|
88
|
+
md_rmsd: 0.29
|
|
89
|
+
cavity: 0.12
|
|
90
|
+
hbond_persistence: 0.24
|
|
91
|
+
ss_preservation: 0.18
|
|
92
|
+
sasa_ratio: 0.05
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Validated Results
|
|
96
|
+
|
|
97
|
+
| Protein | Verdict | Risk Score |
|
|
98
|
+
|---------|---------|------------|
|
|
99
|
+
| Ubiquitin (1UBQ) | PASS | 0.257 |
|
|
100
|
+
| GFP (1EMA) | PASS | 0.281 |
|
|
101
|
+
| Alpha-synuclein (1XQ8) | FAIL | 0.555 |
|
|
102
|
+
|
|
103
|
+
### Performance
|
|
104
|
+
|
|
105
|
+
| Protein | MD Duration | Wall Time | GPU |
|
|
106
|
+
|---------|-------------|-----------|-----|
|
|
107
|
+
| Ubiquitin (76 aa) | 10 ns | ~23 min | RTX 4070 |
|
|
108
|
+
| GFP (238 aa) | 10 ns | ~49 min | RTX 4070 |
|
|
109
|
+
|
|
110
|
+
## Usage
|
|
111
|
+
|
|
112
|
+
ProtQC provides three usage modes:
|
|
113
|
+
|
|
114
|
+
### CLI — Single Protein Analysis
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
# Analyze a PDB file
|
|
118
|
+
protqc analyze protein.pdb
|
|
119
|
+
|
|
120
|
+
# Enter a PDB ID — auto-downloads from RCSB
|
|
121
|
+
protqc analyze 1UBQ
|
|
122
|
+
|
|
123
|
+
# Skip MD simulation for quick structural checks
|
|
124
|
+
protqc analyze protein.pdb --skip-md
|
|
125
|
+
|
|
126
|
+
# Set MD simulation length
|
|
127
|
+
protqc analyze protein.pdb --md-duration 10
|
|
128
|
+
|
|
129
|
+
# Use pre-computed MD trajectory
|
|
130
|
+
protqc analyze protein.pdb --trajectory md_output.csv
|
|
131
|
+
|
|
132
|
+
# Generate FastQC-style HTML report
|
|
133
|
+
protqc analyze protein.pdb --html report.html
|
|
134
|
+
|
|
135
|
+
# JSON output
|
|
136
|
+
protqc analyze protein.pdb --format json
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Interactive Mode
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Launch interactive prompt — guides you through analysis
|
|
143
|
+
protqc
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### AI Chat Assistant
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Start AI-powered chat for interpreting results
|
|
150
|
+
protqc chat
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Chat supports 8 providers via LiteLLM: **OpenAI**, **Anthropic**, **Google**, **DeepSeek**, **OpenRouter**, **Moonshot**, **MiniMax**, **Zhipu**.
|
|
154
|
+
|
|
155
|
+
## Installation
|
|
156
|
+
|
|
157
|
+
### Docker (recommended — all platforms)
|
|
158
|
+
|
|
159
|
+
Docker is the easiest way to run ProtQC with all dependencies (OpenMM, CUDA, fpocket, FreeSASA, MDTraj):
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Build the image
|
|
163
|
+
docker build -t protqc .
|
|
164
|
+
|
|
165
|
+
# Analyze a protein (GPU-accelerated)
|
|
166
|
+
docker run --gpus all -v $(pwd)/data:/app/data protqc analyze data/benchmark/ubiquitin.pdb
|
|
167
|
+
|
|
168
|
+
# Run with an explicit MD simulation length
|
|
169
|
+
docker run --gpus all -v $(pwd)/data:/app/data protqc analyze data/benchmark/ubiquitin.pdb --md-duration 10
|
|
170
|
+
|
|
171
|
+
# CPU-only (skips MD, which is very slow without a GPU)
|
|
172
|
+
docker run -v $(pwd)/data:/app/data -e CUDA_VISIBLE_DEVICES="" protqc analyze protein.pdb --skip-md
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Docker Compose:**
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# GPU-accelerated
|
|
179
|
+
docker compose run protqc analyze data/benchmark/ubiquitin.pdb
|
|
180
|
+
|
|
181
|
+
# CPU-only variant
|
|
182
|
+
docker compose run protqc-cpu analyze data/benchmark/ubiquitin.pdb --skip-md
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
> **Note:** GPU support requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). Without a GPU, MD simulations still work but are significantly slower (~10–50x). Use `--skip-md` for quick checks without MD.
|
|
186
|
+
|
|
187
|
+
### Source install (Linux only)
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
conda create -n protqc python=3.11
|
|
191
|
+
conda activate protqc
|
|
192
|
+
|
|
193
|
+
# OpenMM from conda-forge (includes CUDA support)
|
|
194
|
+
conda install -c conda-forge openmm
|
|
195
|
+
|
|
196
|
+
# ProtQC + all dependencies
|
|
197
|
+
pip install -e '.[all]'
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
> **Platform Support:** Source installation requires Linux. OpenMM and fpocket have limited support on macOS/Windows. Use Docker on non-Linux platforms.
|
|
201
|
+
|
|
202
|
+
## Configuration
|
|
203
|
+
|
|
204
|
+
All thresholds, weights, and verdict boundaries are defined in [`configs/thresholds.yaml`](configs/thresholds.yaml). Key tunables:
|
|
205
|
+
|
|
206
|
+
- **Intrinsically disordered proteins:** Increase `physics_verifier.md_rmsd_max_angstrom` (e.g., 8.0–10.0) since higher RMSD is expected
|
|
207
|
+
- **Membrane proteins:** Adjust `surface.sasa_polar_ratio_min/max` for transmembrane segments
|
|
208
|
+
|
|
209
|
+
## Limitations
|
|
210
|
+
|
|
211
|
+
ProtQC is a rapid pre-screening tool, not a substitute for comprehensive computational or experimental validation:
|
|
212
|
+
|
|
213
|
+
- **MD simulation length.** The default 10 ns simulation is a rapid pre-screen that catches catastrophic failures (large RMSD drift, complete unfolding). Subtle instabilities — slow conformational changes, partial unfolding events, aggregation-prone intermediates — may require 100–500 ns simulations for reliable detection (Lindorff-Larsen et al. 2011; Ferruz et al. 2022). Treat a ProtQC PASS as "no obvious red flags," not "experimentally validated."
|
|
214
|
+
|
|
215
|
+
- **Cavity detection.** fpocket was designed for identifying druggable surface binding pockets, not for internal void quality control (Le Guilloux et al. 2009). The suspicious cavity flagging (volume > 800 Å³, druggability < 0.4) is a literature-informed heuristic (Schmidtke et al. 2010), not a validated structural defect detector. Combine with packing density metrics or Voronoi-based tools for higher confidence.
|
|
216
|
+
|
|
217
|
+
- **Risk score weights.** The current weights are expert estimates based on published benchmarks (Dauparas et al. 2022; Ferruz et al. 2022) and will be refined through calibration on larger, more diverse protein sets. Different protein families (membrane proteins, IDPs, repeat proteins) may need substantially different weight profiles.
|
|
218
|
+
|
|
219
|
+
## Related Tools
|
|
220
|
+
|
|
221
|
+
| Tool | Focus |
|
|
222
|
+
|------|-------|
|
|
223
|
+
| [CHAPERONg](https://github.com/paulshamrat/CHAPERONg) | Automated GROMACS MD analysis |
|
|
224
|
+
| [MolProbity](https://github.com/rlabduke/MolProbity) | Stereochemistry validation |
|
|
225
|
+
| [QMEAN](https://swissmodel.expasy.org/qmean/) | Statistical potential scoring |
|
|
226
|
+
| [VoroMQA](https://bioinformatics.lt/wtsam/voromqa) | Voronoi tessellation quality |
|
|
227
|
+
| [ProSA](https://prosa.services.came.sbg.ac.at/prosa.php) | Statistical analysis of protein structures |
|
|
228
|
+
| [ProteinDJ](https://github.com/PapenfussLab/proteindj) | AI protein design evaluation |
|
|
229
|
+
| [BinderFlow](https://github.com/cryoEM-CNIO/BinderFlow) | Binder design pipeline |
|
|
230
|
+
| [OVO](https://github.com/MSDLLCpapers/ovo) | De novo protein design ecosystem |
|
|
231
|
+
|
|
232
|
+
## Roadmap
|
|
233
|
+
|
|
234
|
+
**v0.2.0** — Benchmark dataset (25 proteins, Garcia/Hermosilla/Chevalier), Colab MCP integration, weight calibration, replica runs
|
|
235
|
+
|
|
236
|
+
**v0.3.0** — Thermal stability prediction, MultiQC-style batch reports, Nextflow/Snakemake templates, REST API
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
MIT
|
|
241
|
+
|
|
242
|
+
## Citation
|
|
243
|
+
|
|
244
|
+
```
|
|
245
|
+
Güzel, Ö.K. (2026). ProtQC: Physics-based verification of AI-designed protein structures.
|
|
246
|
+
github.com/korayguzel/protqc
|
|
247
|
+
```
|
protqc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# ProtQC
|
|
2
|
+
|
|
3
|
+
**Physics-based verification of AI-designed protein structures**
|
|
4
|
+
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
7
|
+
|
|
8
|
+
*Catches structural hallucinations before wet-lab*
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Why ProtQC?
|
|
13
|
+
|
|
14
|
+
AI protein design tools (AlphaFold, RFdiffusion, ProteinMPNN, BoltzGen) routinely produce structures with high confidence scores (pLDDT > 90) that still fail experimentally. A protein can look perfect by pLDDT yet harbor internal voids, unstable hydrogen bond networks, or thermodynamic instabilities that only surface in solution.
|
|
15
|
+
|
|
16
|
+
ProtQC combines six physics-based metrics into a composite risk score, catching high-pLDDT hallucinations that no single metric detects on its own.
|
|
17
|
+
|
|
18
|
+
## Quick Start
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
protqc analyze protein.pdb
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## The 6 Metrics
|
|
25
|
+
|
|
26
|
+
| # | Metric | Source | What It Catches |
|
|
27
|
+
|---|--------|--------|-----------------|
|
|
28
|
+
| 1 | **pLDDT** | Structure prediction | Low confidence regions |
|
|
29
|
+
| 2 | **MD RMSD** | OpenMM | Backbone instability under simulation |
|
|
30
|
+
| 3 | **Cavity Volume** | fpocket | Internal voids and packing defects |
|
|
31
|
+
| 4 | **H-bond Persistence** | MDTraj | Weak hydrogen bond networks |
|
|
32
|
+
| 5 | **SS Preservation** | MDTraj DSSP | Secondary structure loss during MD |
|
|
33
|
+
| 6 | **SASA Polar Ratio** | FreeSASA | Abnormal surface accessibility |
|
|
34
|
+
|
|
35
|
+
Each metric produces a normalized 0–1 sub-score. The composite risk score is a weighted sum, mapped to a verdict:
|
|
36
|
+
|
|
37
|
+
- **PASS** (risk < 0.30) — Design is physically plausible
|
|
38
|
+
- **WARNING** (0.30 ≤ risk < 0.50) — Proceed with caution; review flagged metrics
|
|
39
|
+
- **FAIL** (risk ≥ 0.50) — Design has significant structural issues
|
|
40
|
+
|
|
41
|
+
### Risk Scoring Weights
|
|
42
|
+
|
|
43
|
+
```yaml
|
|
44
|
+
risk_weights:
|
|
45
|
+
plddt: 0.12
|
|
46
|
+
md_rmsd: 0.29
|
|
47
|
+
cavity: 0.12
|
|
48
|
+
hbond_persistence: 0.24
|
|
49
|
+
ss_preservation: 0.18
|
|
50
|
+
sasa_ratio: 0.05
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Validated Results
|
|
54
|
+
|
|
55
|
+
| Protein | Verdict | Risk Score |
|
|
56
|
+
|---------|---------|------------|
|
|
57
|
+
| Ubiquitin (1UBQ) | PASS | 0.257 |
|
|
58
|
+
| GFP (1EMA) | PASS | 0.281 |
|
|
59
|
+
| Alpha-synuclein (1XQ8) | FAIL | 0.555 |
|
|
60
|
+
|
|
61
|
+
### Performance
|
|
62
|
+
|
|
63
|
+
| Protein | MD Duration | Wall Time | GPU |
|
|
64
|
+
|---------|-------------|-----------|-----|
|
|
65
|
+
| Ubiquitin (76 aa) | 10 ns | ~23 min | RTX 4070 |
|
|
66
|
+
| GFP (238 aa) | 10 ns | ~49 min | RTX 4070 |
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
ProtQC provides three usage modes:
|
|
71
|
+
|
|
72
|
+
### CLI — Single Protein Analysis
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Analyze a PDB file
|
|
76
|
+
protqc analyze protein.pdb
|
|
77
|
+
|
|
78
|
+
# Enter a PDB ID — auto-downloads from RCSB
|
|
79
|
+
protqc analyze 1UBQ
|
|
80
|
+
|
|
81
|
+
# Skip MD simulation for quick structural checks
|
|
82
|
+
protqc analyze protein.pdb --skip-md
|
|
83
|
+
|
|
84
|
+
# Set MD simulation length
|
|
85
|
+
protqc analyze protein.pdb --md-duration 10
|
|
86
|
+
|
|
87
|
+
# Use pre-computed MD trajectory
|
|
88
|
+
protqc analyze protein.pdb --trajectory md_output.csv
|
|
89
|
+
|
|
90
|
+
# Generate FastQC-style HTML report
|
|
91
|
+
protqc analyze protein.pdb --html report.html
|
|
92
|
+
|
|
93
|
+
# JSON output
|
|
94
|
+
protqc analyze protein.pdb --format json
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Interactive Mode
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
# Launch interactive prompt — guides you through analysis
|
|
101
|
+
protqc
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### AI Chat Assistant
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Start AI-powered chat for interpreting results
|
|
108
|
+
protqc chat
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Chat supports 8 providers via LiteLLM: **OpenAI**, **Anthropic**, **Google**, **DeepSeek**, **OpenRouter**, **Moonshot**, **MiniMax**, **Zhipu**.
|
|
112
|
+
|
|
113
|
+
## Installation
|
|
114
|
+
|
|
115
|
+
### Docker (recommended — all platforms)
|
|
116
|
+
|
|
117
|
+
Docker is the easiest way to run ProtQC with all dependencies (OpenMM, CUDA, fpocket, FreeSASA, MDTraj):
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Build the image
|
|
121
|
+
docker build -t protqc .
|
|
122
|
+
|
|
123
|
+
# Analyze a protein (GPU-accelerated)
|
|
124
|
+
docker run --gpus all -v $(pwd)/data:/app/data protqc analyze data/benchmark/ubiquitin.pdb
|
|
125
|
+
|
|
126
|
+
# Run with an explicit MD simulation length
|
|
127
|
+
docker run --gpus all -v $(pwd)/data:/app/data protqc analyze data/benchmark/ubiquitin.pdb --md-duration 10
|
|
128
|
+
|
|
129
|
+
# CPU-only (skips MD, which is very slow without a GPU)
|
|
130
|
+
docker run -v $(pwd)/data:/app/data -e CUDA_VISIBLE_DEVICES="" protqc analyze protein.pdb --skip-md
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Docker Compose:**
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
# GPU-accelerated
|
|
137
|
+
docker compose run protqc analyze data/benchmark/ubiquitin.pdb
|
|
138
|
+
|
|
139
|
+
# CPU-only variant
|
|
140
|
+
docker compose run protqc-cpu analyze data/benchmark/ubiquitin.pdb --skip-md
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
> **Note:** GPU support requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). Without a GPU, MD simulations still work but are significantly slower (~10–50x). Use `--skip-md` for quick checks without MD.
|
|
144
|
+
|
|
145
|
+
### Source install (Linux only)
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
conda create -n protqc python=3.11
|
|
149
|
+
conda activate protqc
|
|
150
|
+
|
|
151
|
+
# OpenMM from conda-forge (includes CUDA support)
|
|
152
|
+
conda install -c conda-forge openmm
|
|
153
|
+
|
|
154
|
+
# ProtQC + all dependencies
|
|
155
|
+
pip install -e '.[all]'
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
> **Platform Support:** Source installation requires Linux. OpenMM and fpocket have limited support on macOS/Windows. Use Docker on non-Linux platforms.
|
|
159
|
+
|
|
160
|
+
## Configuration
|
|
161
|
+
|
|
162
|
+
All thresholds, weights, and verdict boundaries are defined in [`configs/thresholds.yaml`](configs/thresholds.yaml). Key tunables:
|
|
163
|
+
|
|
164
|
+
- **Intrinsically disordered proteins:** Increase `physics_verifier.md_rmsd_max_angstrom` (e.g., 8.0–10.0) since higher RMSD is expected
|
|
165
|
+
- **Membrane proteins:** Adjust `surface.sasa_polar_ratio_min/max` for transmembrane segments
|
|
166
|
+
|
|
167
|
+
## Limitations
|
|
168
|
+
|
|
169
|
+
ProtQC is a rapid pre-screening tool, not a substitute for comprehensive computational or experimental validation:
|
|
170
|
+
|
|
171
|
+
- **MD simulation length.** The default 10 ns simulation is a rapid pre-screen that catches catastrophic failures (large RMSD drift, complete unfolding). Subtle instabilities — slow conformational changes, partial unfolding events, aggregation-prone intermediates — may require 100–500 ns simulations for reliable detection (Lindorff-Larsen et al. 2011; Ferruz et al. 2022). Treat a ProtQC PASS as "no obvious red flags," not "experimentally validated."
|
|
172
|
+
|
|
173
|
+
- **Cavity detection.** fpocket was designed for identifying druggable surface binding pockets, not for internal void quality control (Le Guilloux et al. 2009). The suspicious cavity flagging (volume > 800 Å³, druggability < 0.4) is a literature-informed heuristic (Schmidtke et al. 2010), not a validated structural defect detector. Combine with packing density metrics or Voronoi-based tools for higher confidence.
|
|
174
|
+
|
|
175
|
+
- **Risk score weights.** The current weights are expert estimates based on published benchmarks (Dauparas et al. 2022; Ferruz et al. 2022) and will be refined through calibration on larger, more diverse protein sets. Different protein families (membrane proteins, IDPs, repeat proteins) may need substantially different weight profiles.
|
|
176
|
+
|
|
177
|
+
## Related Tools
|
|
178
|
+
|
|
179
|
+
| Tool | Focus |
|
|
180
|
+
|------|-------|
|
|
181
|
+
| [CHAPERONg](https://github.com/paulshamrat/CHAPERONg) | Automated GROMACS MD analysis |
|
|
182
|
+
| [MolProbity](https://github.com/rlabduke/MolProbity) | Stereochemistry validation |
|
|
183
|
+
| [QMEAN](https://swissmodel.expasy.org/qmean/) | Statistical potential scoring |
|
|
184
|
+
| [VoroMQA](https://bioinformatics.lt/wtsam/voromqa) | Voronoi tessellation quality |
|
|
185
|
+
| [ProSA](https://prosa.services.came.sbg.ac.at/prosa.php) | Statistical analysis of protein structures |
|
|
186
|
+
| [ProteinDJ](https://github.com/PapenfussLab/proteindj) | AI protein design evaluation |
|
|
187
|
+
| [BinderFlow](https://github.com/cryoEM-CNIO/BinderFlow) | Binder design pipeline |
|
|
188
|
+
| [OVO](https://github.com/MSDLLCpapers/ovo) | De novo protein design ecosystem |
|
|
189
|
+
|
|
190
|
+
## Roadmap
|
|
191
|
+
|
|
192
|
+
**v0.2.0** — Benchmark dataset (25 proteins, Garcia/Hermosilla/Chevalier), Colab MCP integration, weight calibration, replica runs
|
|
193
|
+
|
|
194
|
+
**v0.3.0** — Thermal stability prediction, MultiQC-style batch reports, Nextflow/Snakemake templates, REST API
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT
|
|
199
|
+
|
|
200
|
+
## Citation
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
Güzel, Ö.K. (2026). ProtQC: Physics-based verification of AI-designed protein structures.
|
|
204
|
+
github.com/korayguzel/protqc
|
|
205
|
+
```
|
protqc-0.1.0/protqc/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ProtQC — Physics-based verification of AI-generated protein designs.
|
|
3
|
+
|
|
4
|
+
A multi-agent framework that catches structural hallucinations before wet-lab.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
from protqc.types import (
|
|
10
|
+
ToolResult,
|
|
11
|
+
VerificationMetrics,
|
|
12
|
+
RiskVerdict,
|
|
13
|
+
PipelineResult,
|
|
14
|
+
)
|
|
15
|
+
from protqc.config import load_config
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ToolResult",
|
|
19
|
+
"VerificationMetrics",
|
|
20
|
+
"RiskVerdict",
|
|
21
|
+
"PipelineResult",
|
|
22
|
+
"load_config",
|
|
23
|
+
]
|