debase 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase-0.1.0/.gitignore +177 -0
- debase-0.1.0/CONTRIBUTING.md +61 -0
- debase-0.1.0/LICENSE +21 -0
- debase-0.1.0/MANIFEST.in +9 -0
- debase-0.1.0/PKG-INFO +299 -0
- debase-0.1.0/README.md +242 -0
- debase-0.1.0/docs/README.md +19 -0
- debase-0.1.0/docs/examples/README.md +24 -0
- debase-0.1.0/environment.yml +21 -0
- debase-0.1.0/pyproject.toml +90 -0
- debase-0.1.0/setup.cfg +4 -0
- debase-0.1.0/setup.py +60 -0
- debase-0.1.0/src/__init__.py +1 -0
- debase-0.1.0/src/debase/PIPELINE_FLOW.md +100 -0
- debase-0.1.0/src/debase/__init__.py +18 -0
- debase-0.1.0/src/debase/__main__.py +9 -0
- debase-0.1.0/src/debase/_version.py +3 -0
- debase-0.1.0/src/debase/build_db.py +190 -0
- debase-0.1.0/src/debase/cleanup_sequence.py +905 -0
- debase-0.1.0/src/debase/enzyme_lineage_extractor.py +2169 -0
- debase-0.1.0/src/debase/lineage_format.py +808 -0
- debase-0.1.0/src/debase/reaction_info_extractor.py +2331 -0
- debase-0.1.0/src/debase/substrate_scope_extractor.py +2039 -0
- debase-0.1.0/src/debase/wrapper.py +303 -0
- debase-0.1.0/src/debase.egg-info/PKG-INFO +299 -0
- debase-0.1.0/src/debase.egg-info/SOURCES.txt +28 -0
- debase-0.1.0/src/debase.egg-info/dependency_links.txt +1 -0
- debase-0.1.0/src/debase.egg-info/entry_points.txt +2 -0
- debase-0.1.0/src/debase.egg-info/requires.txt +28 -0
- debase-0.1.0/src/debase.egg-info/top_level.txt +1 -0
debase-0.1.0/.gitignore
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
2
|
+
__pycache__/
|
3
|
+
*.py[cod]
|
4
|
+
*$py.class
|
5
|
+
|
6
|
+
# C extensions
|
7
|
+
*.so
|
8
|
+
|
9
|
+
# Distribution / packaging
|
10
|
+
.Python
|
11
|
+
build/
|
12
|
+
develop-eggs/
|
13
|
+
dist/
|
14
|
+
downloads/
|
15
|
+
eggs/
|
16
|
+
.eggs/
|
17
|
+
lib/
|
18
|
+
lib64/
|
19
|
+
parts/
|
20
|
+
sdist/
|
21
|
+
var/
|
22
|
+
wheels/
|
23
|
+
share/python-wheels/
|
24
|
+
*.egg-info/
|
25
|
+
.installed.cfg
|
26
|
+
*.egg
|
27
|
+
MANIFEST
|
28
|
+
|
29
|
+
# PyInstaller
|
30
|
+
*.manifest
|
31
|
+
*.spec
|
32
|
+
|
33
|
+
# Installer logs
|
34
|
+
pip-log.txt
|
35
|
+
pip-delete-this-directory.txt
|
36
|
+
|
37
|
+
# Unit test / coverage reports
|
38
|
+
htmlcov/
|
39
|
+
.tox/
|
40
|
+
.nox/
|
41
|
+
.coverage
|
42
|
+
.coverage.*
|
43
|
+
.cache
|
44
|
+
nosetests.xml
|
45
|
+
coverage.xml
|
46
|
+
*.cover
|
47
|
+
*.py,cover
|
48
|
+
.hypothesis/
|
49
|
+
.pytest_cache/
|
50
|
+
cover/
|
51
|
+
|
52
|
+
# Jupyter Notebook
|
53
|
+
.ipynb_checkpoints
|
54
|
+
|
55
|
+
# IPython
|
56
|
+
profile_default/
|
57
|
+
ipython_config.py
|
58
|
+
|
59
|
+
# pyenv
|
60
|
+
.python-version
|
61
|
+
|
62
|
+
# pipenv
|
63
|
+
Pipfile.lock
|
64
|
+
|
65
|
+
# poetry
|
66
|
+
poetry.lock
|
67
|
+
|
68
|
+
# pdm
|
69
|
+
.pdm.toml
|
70
|
+
|
71
|
+
# PEP 582
|
72
|
+
__pypackages__/
|
73
|
+
|
74
|
+
# Celery stuff
|
75
|
+
celerybeat-schedule
|
76
|
+
celerybeat.pid
|
77
|
+
|
78
|
+
# SageMath parsed files
|
79
|
+
*.sage.py
|
80
|
+
|
81
|
+
# Environments
|
82
|
+
.env
|
83
|
+
.venv
|
84
|
+
env/
|
85
|
+
venv/
|
86
|
+
ENV/
|
87
|
+
env.bak/
|
88
|
+
venv.bak/
|
89
|
+
|
90
|
+
# Spyder project settings
|
91
|
+
.spyderproject
|
92
|
+
.spyproject
|
93
|
+
|
94
|
+
# Rope project settings
|
95
|
+
.ropeproject
|
96
|
+
|
97
|
+
# mkdocs documentation
|
98
|
+
/site
|
99
|
+
|
100
|
+
# mypy
|
101
|
+
.mypy_cache/
|
102
|
+
.dmypy.json
|
103
|
+
dmypy.json
|
104
|
+
|
105
|
+
# Pyre type checker
|
106
|
+
.pyre/
|
107
|
+
|
108
|
+
# pytype static type analyzer
|
109
|
+
.pytype/
|
110
|
+
|
111
|
+
# Cython debug symbols
|
112
|
+
cython_debug/
|
113
|
+
|
114
|
+
# PyCharm
|
115
|
+
.idea/
|
116
|
+
|
117
|
+
# VS Code
|
118
|
+
.vscode/
|
119
|
+
|
120
|
+
# macOS
|
121
|
+
.DS_Store
|
122
|
+
.AppleDouble
|
123
|
+
.LSOverride
|
124
|
+
|
125
|
+
# Windows
|
126
|
+
Thumbs.db
|
127
|
+
Thumbs.db:encryptable
|
128
|
+
ehthumbs.db
|
129
|
+
ehthumbs_vista.db
|
130
|
+
*.stackdump
|
131
|
+
[Dd]esktop.ini
|
132
|
+
$RECYCLE.BIN/
|
133
|
+
*.cab
|
134
|
+
*.msi
|
135
|
+
*.msix
|
136
|
+
*.msm
|
137
|
+
*.msp
|
138
|
+
*.lnk
|
139
|
+
|
140
|
+
# Linux
|
141
|
+
*~
|
142
|
+
|
143
|
+
# Temporary files
|
144
|
+
*.tmp
|
145
|
+
*.temp
|
146
|
+
*.log
|
147
|
+
.temp_*/
|
148
|
+
.cache/
|
149
|
+
|
150
|
+
# DEBase specific
|
151
|
+
enzyme_pipeline*.log
|
152
|
+
temp_merged_input.csv
|
153
|
+
*.egg-info/
|
154
|
+
|
155
|
+
# Project data and examples (examples/test.csv is kept as example output)
|
156
|
+
data/
|
157
|
+
examples/*
|
158
|
+
!examples/test.csv
|
159
|
+
|
160
|
+
# Cache files
|
161
|
+
*.pkl
|
162
|
+
*_cache.pkl
|
163
|
+
|
164
|
+
# Large database files
|
165
|
+
*.db
|
166
|
+
|
167
|
+
# PDFs and Excel files
|
168
|
+
*.pdf
|
169
|
+
*.xlsx
|
170
|
+
|
171
|
+
# Backup files
|
172
|
+
*_backup.py
|
173
|
+
lineage_format_backup.py
|
174
|
+
|
175
|
+
# Temporary directories
|
176
|
+
.temp_*
|
177
|
+
enzyme_analysis_*
|
debase-0.1.0/CONTRIBUTING.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Contributing to DEBase
|
2
|
+
|
3
|
+
Thank you for your interest in contributing to DEBase!
|
4
|
+
|
5
|
+
## Development Setup
|
6
|
+
|
7
|
+
1. Clone the repository:
|
8
|
+
```bash
|
9
|
+
git clone https://github.com/YuemingLong/DEBase.git
|
10
|
+
cd DEBase
|
11
|
+
```
|
12
|
+
|
13
|
+
2. Create a virtual environment:
|
14
|
+
```bash
|
15
|
+
python -m venv venv
|
16
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
17
|
+
```
|
18
|
+
|
19
|
+
3. Install in development mode:
|
20
|
+
```bash
|
21
|
+
pip install -e ".[dev]"
|
22
|
+
```
|
23
|
+
|
24
|
+
## Running Tests
|
25
|
+
|
26
|
+
```bash
|
27
|
+
pytest tests/
|
28
|
+
```
|
29
|
+
|
30
|
+
## Code Style
|
31
|
+
|
32
|
+
We use Black for code formatting:
|
33
|
+
```bash
|
34
|
+
black src/ tests/
|
35
|
+
```
|
36
|
+
|
37
|
+
And isort for import sorting:
|
38
|
+
```bash
|
39
|
+
isort src/ tests/
|
40
|
+
```
|
41
|
+
|
42
|
+
## Project Structure
|
43
|
+
|
44
|
+
```
|
45
|
+
debase/
|
46
|
+
├── src/debase/ # Main package source code
|
47
|
+
├── tests/ # Test suite
|
48
|
+
├── docs/ # Documentation
|
49
|
+
├── examples/ # Example outputs and usage
|
50
|
+
├── data/ # Research data (PDFs)
|
51
|
+
└── scripts/ # Utility scripts
|
52
|
+
```
|
53
|
+
|
54
|
+
## Submitting Changes
|
55
|
+
|
56
|
+
1. Fork the repository
|
57
|
+
2. Create a feature branch
|
58
|
+
3. Make your changes
|
59
|
+
4. Add tests if applicable
|
60
|
+
5. Run the test suite
|
61
|
+
6. Submit a pull request
|
debase-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 DEBase Contributors
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
debase-0.1.0/MANIFEST.in
ADDED
debase-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,299 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: debase
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Enzyme lineage analysis and sequence extraction package
|
5
|
+
Home-page: https://github.com/YuemingLong/DEBase
|
6
|
+
Author: DEBase Team
|
7
|
+
Author-email: DEBase Team <ylong@caltech.edu>
|
8
|
+
License: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/YuemingLong/DEBase
|
10
|
+
Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
|
11
|
+
Project-URL: Repository, https://github.com/YuemingLong/DEBase
|
12
|
+
Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
25
|
+
Requires-Python: >=3.8
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
License-File: LICENSE
|
28
|
+
Requires-Dist: pandas>=1.0.0
|
29
|
+
Requires-Dist: PyMuPDF>=1.18.0
|
30
|
+
Requires-Dist: numpy>=1.19.0
|
31
|
+
Requires-Dist: google-generativeai>=0.3.0
|
32
|
+
Requires-Dist: biopython>=1.78
|
33
|
+
Requires-Dist: requests>=2.25.0
|
34
|
+
Requires-Dist: httpx>=0.24.0
|
35
|
+
Requires-Dist: tqdm>=4.60.0
|
36
|
+
Requires-Dist: openpyxl>=3.0.0
|
37
|
+
Requires-Dist: PyPDF2>=2.0.0
|
38
|
+
Requires-Dist: Pillow>=8.0.0
|
39
|
+
Requires-Dist: networkx>=2.5
|
40
|
+
Provides-Extra: rdkit
|
41
|
+
Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
|
42
|
+
Provides-Extra: dev
|
43
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
44
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
45
|
+
Requires-Dist: black; extra == "dev"
|
46
|
+
Requires-Dist: isort; extra == "dev"
|
47
|
+
Requires-Dist: flake8; extra == "dev"
|
48
|
+
Requires-Dist: mypy; extra == "dev"
|
49
|
+
Provides-Extra: docs
|
50
|
+
Requires-Dist: sphinx>=4.0; extra == "docs"
|
51
|
+
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
52
|
+
Requires-Dist: myst-parser; extra == "docs"
|
53
|
+
Dynamic: author
|
54
|
+
Dynamic: home-page
|
55
|
+
Dynamic: license-file
|
56
|
+
Dynamic: requires-python
|
57
|
+
|
58
|
+
# DEBase
|
59
|
+
|
60
|
+
Enzyme lineage analysis and sequence extraction package with advanced parallel processing capabilities.
|
61
|
+
|
62
|
+
## Installation
|
63
|
+
|
64
|
+
```bash
|
65
|
+
pip install debase
|
66
|
+
```
|
67
|
+
|
68
|
+
For full functionality with chemical SMILES support:
|
69
|
+
|
70
|
+
```bash
|
71
|
+
pip install "debase[rdkit]"
|
72
|
+
```
|
73
|
+
|
74
|
+
## Requirements
|
75
|
+
|
76
|
+
- Python 3.8 or higher
|
77
|
+
- A Gemini API key (set as environment variable `GEMINI_API_KEY`)
|
78
|
+
|
79
|
+
## Recent Updates
|
80
|
+
|
81
|
+
- **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
|
82
|
+
- **Improved Model Support**: Updated to use stable Gemini models for better reliability
|
83
|
+
- **Enhanced PDB Integration**: Intelligent AI-based matching of PDB structures to enzyme variants
|
84
|
+
- **Better Filtering**: Automatic removal of non-enzyme entries (buffers, controls, media)
|
85
|
+
- **Optimized Performance**: Removed unnecessary rate limiting for faster processing
|
86
|
+
- **External Sequence Fetching**: Automatic retrieval from PDB and UniProt databases when sequences aren't in papers
|
87
|
+
- **Improved SI Processing**: Structure-aware extraction of supplementary information
|
88
|
+
- **Vision Support**: Extracts data from figures and tables using multimodal AI capabilities
|
89
|
+
|
90
|
+
## Quick Start
|
91
|
+
|
92
|
+
### Basic Usage
|
93
|
+
```bash
|
94
|
+
# Run the full pipeline (sequential processing)
|
95
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv
|
96
|
+
```
|
97
|
+
|
98
|
+
### High-Performance Parallel Processing
|
99
|
+
```bash
|
100
|
+
# Use parallel individual processing for maximum speed + accuracy
|
101
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
102
|
+
--use-parallel-individual --max-workers 5
|
103
|
+
|
104
|
+
# Use batch processing for maximum speed (slight accuracy trade-off)
|
105
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
106
|
+
--use-optimized-reaction --reaction-batch-size 5
|
107
|
+
```
|
108
|
+
|
109
|
+
## Processing Methods
|
110
|
+
|
111
|
+
DEBase offers three processing approaches optimized for different use cases:
|
112
|
+
|
113
|
+
### 1. **Parallel Individual Processing** (Recommended)
|
114
|
+
- **42 individual API calls** (21 for reactions + 21 for substrate scope)
|
115
|
+
- **5 calls running simultaneously** for 4-5x speedup
|
116
|
+
- **High accuracy** - each enzyme gets dedicated attention
|
117
|
+
- **Best for:** Production use, important analyses
|
118
|
+
|
119
|
+
```bash
|
120
|
+
debase --manuscript paper.pdf --si si.pdf --use-parallel-individual --max-workers 5
|
121
|
+
```
|
122
|
+
|
123
|
+
### 2. **Batch Processing** (Fastest)
|
124
|
+
- **~8 total API calls** (multiple enzymes per call)
|
125
|
+
- **Fastest processing** - up to 8x speedup
|
126
|
+
- **Good accuracy** - slight trade-off for complex chemical names
|
127
|
+
- **Best for:** Quick analyses, large-scale processing
|
128
|
+
|
129
|
+
```bash
|
130
|
+
debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-batch-size 5
|
131
|
+
```
|
132
|
+
|
133
|
+
### 3. **Sequential Processing** (Most Accurate)
|
134
|
+
- **42 sequential API calls** (one at a time)
|
135
|
+
- **Highest accuracy** but slowest
|
136
|
+
- **Best for:** Critical analyses, small datasets
|
137
|
+
|
138
|
+
```bash
|
139
|
+
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
|
+
```
|
141
|
+
|
142
|
+
## Performance Comparison
|
143
|
+
|
144
|
+
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
+
|--------|------------|-----------|----------|----------|
|
146
|
+
| Sequential | ~45 min | 42 calls | Highest | Small datasets |
|
147
|
+
| **Parallel Individual** | **~12 min** | **42 calls** | **High** | **Recommended** |
|
148
|
+
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
|
+
|
150
|
+
## Advanced Usage
|
151
|
+
|
152
|
+
### Skip Steps with Existing Data
|
153
|
+
```bash
|
154
|
+
# Skip lineage extraction if you already have it
|
155
|
+
debase --manuscript paper.pdf --si si.pdf --output output.csv \
|
156
|
+
--skip-lineage --existing-lineage existing_lineage.csv \
|
157
|
+
--use-parallel-individual
|
158
|
+
```
|
159
|
+
|
160
|
+
### Direct Module Usage
|
161
|
+
```bash
|
162
|
+
# Run only reaction extraction with parallel processing
|
163
|
+
python -m debase.reaction_info_extractor \
|
164
|
+
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
165
|
+
--max-workers 5 --output reactions.csv
|
166
|
+
|
167
|
+
# Run only substrate scope extraction with parallel processing
|
168
|
+
python -m debase.substrate_scope_extractor \
|
169
|
+
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
|
+
--max-workers 5 --output substrate_scope.csv
|
171
|
+
```
|
172
|
+
|
173
|
+
## Python API
|
174
|
+
|
175
|
+
```python
|
176
|
+
from debase.wrapper import run_pipeline
|
177
|
+
|
178
|
+
# Run full pipeline with parallel processing
|
179
|
+
run_pipeline(
|
180
|
+
manuscript_path="paper.pdf",
|
181
|
+
si_path="si.pdf",
|
182
|
+
output="output.csv",
|
183
|
+
use_parallel_individual=True,
|
184
|
+
max_workers=5
|
185
|
+
)
|
186
|
+
|
187
|
+
# For individual steps
|
188
|
+
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
+
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
+
|
191
|
+
model = setup_gemini_api()
|
192
|
+
reaction_data = extract_reaction_info_parallel(
|
193
|
+
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
+
)
|
195
|
+
```
|
196
|
+
|
197
|
+
## Pipeline Architecture
|
198
|
+
|
199
|
+
The DEBase pipeline consists of 5 main steps:
|
200
|
+
|
201
|
+
1. **Lineage Extraction** (Sequential) - Identifies all enzymes and their relationships
|
202
|
+
- Extracts mutation information and evolutionary paths
|
203
|
+
- Detects multiple directed evolution campaigns automatically
|
204
|
+
- Fetches sequences from external databases (PDB, UniProt)
|
205
|
+
- Filters out non-enzyme entries automatically
|
206
|
+
2. **Sequence Cleanup** (Local) - Generates protein sequences from mutations
|
207
|
+
- Applies mutations to parent sequences
|
208
|
+
- Handles complex mutations and domain modifications
|
209
|
+
- Validates sequence integrity
|
210
|
+
3. **Reaction Extraction** (Parallel/Batch/Sequential) - Extracts reaction conditions and performance data
|
211
|
+
- Campaign-aware extraction for multi-lineage papers
|
212
|
+
- Vision-based extraction from figures and tables
|
213
|
+
- Automatic IUPAC name resolution
|
214
|
+
4. **Substrate Scope Extraction** (Parallel/Sequential) - Finds additional substrates tested
|
215
|
+
5. **Data Formatting** (Local) - Combines all data into final output
|
216
|
+
|
217
|
+
## Features
|
218
|
+
|
219
|
+
- **Multi-processing modes:** Sequential, parallel individual, and batch processing
|
220
|
+
- **Campaign detection:** Automatically identifies and separates multiple directed evolution campaigns
|
221
|
+
- **Intelligent error handling:** Automatic retries with exponential backoff
|
222
|
+
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
|
+
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
|
+
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
+
- **Progress tracking:** Real-time status updates
|
226
|
+
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
+
- **Caching:** PDF encoding cache for improved performance
|
228
|
+
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
|
+
|
230
|
+
## Complete Command Reference
|
231
|
+
|
232
|
+
### Core Arguments
|
233
|
+
```bash
|
234
|
+
--manuscript PATH # Required: Path to manuscript PDF
|
235
|
+
--si PATH # Optional: Path to supplementary information PDF
|
236
|
+
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
+
--queries N # Number of consensus queries (default: 2)
|
238
|
+
```
|
239
|
+
|
240
|
+
### Performance Options
|
241
|
+
```bash
|
242
|
+
--use-parallel-individual # Use parallel processing (recommended)
|
243
|
+
--max-workers N # Number of parallel workers (default: 5)
|
244
|
+
--use-optimized-reaction # Use batch processing for speed
|
245
|
+
--reaction-batch-size N # Enzymes per batch (default: 5)
|
246
|
+
--no-parallel-queries # Disable parallel processing
|
247
|
+
```
|
248
|
+
|
249
|
+
### Pipeline Control
|
250
|
+
```bash
|
251
|
+
--skip-lineage # Skip lineage extraction step
|
252
|
+
--skip-sequence # Skip sequence cleanup step
|
253
|
+
--skip-reaction # Skip reaction extraction step
|
254
|
+
--skip-substrate-scope # Skip substrate scope extraction step
|
255
|
+
--skip-lineage-format # Skip final formatting step
|
256
|
+
--skip-validation # Skip data validation step
|
257
|
+
```
|
258
|
+
|
259
|
+
### Data Management
|
260
|
+
```bash
|
261
|
+
--existing-lineage PATH # Use existing lineage data
|
262
|
+
--existing-sequence PATH # Use existing sequence data
|
263
|
+
--existing-reaction PATH # Use existing reaction data
|
264
|
+
--keep-intermediates # Preserve intermediate files
|
265
|
+
```
|
266
|
+
|
267
|
+
### Advanced Options
|
268
|
+
```bash
|
269
|
+
--model-name NAME # Gemini model to use
|
270
|
+
--max-retries N # Maximum retry attempts (default: 2)
|
271
|
+
--max-chars N # Max characters from PDFs (default: 75000)
|
272
|
+
--debug-dir PATH # Directory for debug output (prompts, API responses)
|
273
|
+
```
|
274
|
+
|
275
|
+
## Tips for Best Performance
|
276
|
+
|
277
|
+
1. **Use parallel individual processing** for the best balance of speed and accuracy
|
278
|
+
2. **Set max-workers to 5** to avoid API rate limits while maximizing throughput
|
279
|
+
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
|
+
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
|
+
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
+
6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
|
283
|
+
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
+
|
285
|
+
## Troubleshooting
|
286
|
+
|
287
|
+
### No sequences found
|
288
|
+
- The extractor will automatically search PDB and UniProt databases
|
289
|
+
- Check the logs to see which database IDs were found and which lookups were attempted
|
290
|
+
- Sequences with PDB structures will be fetched with high confidence
|
291
|
+
|
292
|
+
### Incorrect enzyme extraction
|
293
|
+
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
+
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
295
|
+
|
296
|
+
### PDB matching issues
|
297
|
+
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
+
- Increased context extraction ensures better matching accuracy
|
299
|
+
- Check logs for "Gemini PDB matching" entries to see the matching process
|