debase 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase-0.1.3/.gitignore +177 -0
- debase-0.1.3/CONTRIBUTING.md +61 -0
- {debase-0.1.1/src/debase.egg-info → debase-0.1.3}/PKG-INFO +2 -61
- {debase-0.1.1 → debase-0.1.3}/README.md +1 -60
- debase-0.1.3/docs/README.md +19 -0
- debase-0.1.3/docs/examples/README.md +24 -0
- debase-0.1.3/environment.yml +21 -0
- debase-0.1.3/src/debase/PIPELINE_FLOW.md +100 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/_version.py +1 -1
- {debase-0.1.1 → debase-0.1.3}/src/debase/enzyme_lineage_extractor.py +10 -1
- {debase-0.1.1 → debase-0.1.3}/src/debase/reaction_info_extractor.py +52 -7
- {debase-0.1.1 → debase-0.1.3/src/debase.egg-info}/PKG-INFO +2 -61
- {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/SOURCES.txt +7 -0
- debase-0.1.3/src/debase.egg-info/dependency_links.txt +1 -0
- {debase-0.1.1 → debase-0.1.3}/LICENSE +0 -0
- {debase-0.1.1 → debase-0.1.3}/MANIFEST.in +0 -0
- {debase-0.1.1 → debase-0.1.3}/pyproject.toml +0 -0
- {debase-0.1.1 → debase-0.1.3}/setup.cfg +0 -0
- {debase-0.1.1 → debase-0.1.3}/setup.py +0 -0
- /debase-0.1.1/src/debase.egg-info/dependency_links.txt → /debase-0.1.3/src/__init__.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/__init__.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/__main__.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/build_db.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/lineage_format.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase/wrapper.py +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/top_level.txt +0 -0
debase-0.1.3/.gitignore
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
2
|
+
__pycache__/
|
3
|
+
*.py[cod]
|
4
|
+
*$py.class
|
5
|
+
|
6
|
+
# C extensions
|
7
|
+
*.so
|
8
|
+
|
9
|
+
# Distribution / packaging
|
10
|
+
.Python
|
11
|
+
build/
|
12
|
+
develop-eggs/
|
13
|
+
dist/
|
14
|
+
downloads/
|
15
|
+
eggs/
|
16
|
+
.eggs/
|
17
|
+
lib/
|
18
|
+
lib64/
|
19
|
+
parts/
|
20
|
+
sdist/
|
21
|
+
var/
|
22
|
+
wheels/
|
23
|
+
share/python-wheels/
|
24
|
+
*.egg-info/
|
25
|
+
.installed.cfg
|
26
|
+
*.egg
|
27
|
+
MANIFEST
|
28
|
+
|
29
|
+
# PyInstaller
|
30
|
+
*.manifest
|
31
|
+
*.spec
|
32
|
+
|
33
|
+
# Installer logs
|
34
|
+
pip-log.txt
|
35
|
+
pip-delete-this-directory.txt
|
36
|
+
|
37
|
+
# Unit test / coverage reports
|
38
|
+
htmlcov/
|
39
|
+
.tox/
|
40
|
+
.nox/
|
41
|
+
.coverage
|
42
|
+
.coverage.*
|
43
|
+
.cache
|
44
|
+
nosetests.xml
|
45
|
+
coverage.xml
|
46
|
+
*.cover
|
47
|
+
*.py,cover
|
48
|
+
.hypothesis/
|
49
|
+
.pytest_cache/
|
50
|
+
cover/
|
51
|
+
|
52
|
+
# Jupyter Notebook
|
53
|
+
.ipynb_checkpoints
|
54
|
+
|
55
|
+
# IPython
|
56
|
+
profile_default/
|
57
|
+
ipython_config.py
|
58
|
+
|
59
|
+
# pyenv
|
60
|
+
.python-version
|
61
|
+
|
62
|
+
# pipenv
|
63
|
+
Pipfile.lock
|
64
|
+
|
65
|
+
# poetry
|
66
|
+
poetry.lock
|
67
|
+
|
68
|
+
# pdm
|
69
|
+
.pdm.toml
|
70
|
+
|
71
|
+
# PEP 582
|
72
|
+
__pypackages__/
|
73
|
+
|
74
|
+
# Celery stuff
|
75
|
+
celerybeat-schedule
|
76
|
+
celerybeat.pid
|
77
|
+
|
78
|
+
# SageMath parsed files
|
79
|
+
*.sage.py
|
80
|
+
|
81
|
+
# Environments
|
82
|
+
.env
|
83
|
+
.venv
|
84
|
+
env/
|
85
|
+
venv/
|
86
|
+
ENV/
|
87
|
+
env.bak/
|
88
|
+
venv.bak/
|
89
|
+
|
90
|
+
# Spyder project settings
|
91
|
+
.spyderproject
|
92
|
+
.spyproject
|
93
|
+
|
94
|
+
# Rope project settings
|
95
|
+
.ropeproject
|
96
|
+
|
97
|
+
# mkdocs documentation
|
98
|
+
/site
|
99
|
+
|
100
|
+
# mypy
|
101
|
+
.mypy_cache/
|
102
|
+
.dmypy.json
|
103
|
+
dmypy.json
|
104
|
+
|
105
|
+
# Pyre type checker
|
106
|
+
.pyre/
|
107
|
+
|
108
|
+
# pytype static type analyzer
|
109
|
+
.pytype/
|
110
|
+
|
111
|
+
# Cython debug symbols
|
112
|
+
cython_debug/
|
113
|
+
|
114
|
+
# PyCharm
|
115
|
+
.idea/
|
116
|
+
|
117
|
+
# VS Code
|
118
|
+
.vscode/
|
119
|
+
|
120
|
+
# macOS
|
121
|
+
.DS_Store
|
122
|
+
.AppleDouble
|
123
|
+
.LSOverride
|
124
|
+
|
125
|
+
# Windows
|
126
|
+
Thumbs.db
|
127
|
+
Thumbs.db:encryptable
|
128
|
+
ehthumbs.db
|
129
|
+
ehthumbs_vista.db
|
130
|
+
*.stackdump
|
131
|
+
[Dd]esktop.ini
|
132
|
+
$RECYCLE.BIN/
|
133
|
+
*.cab
|
134
|
+
*.msi
|
135
|
+
*.msix
|
136
|
+
*.msm
|
137
|
+
*.msp
|
138
|
+
*.lnk
|
139
|
+
|
140
|
+
# Linux
|
141
|
+
*~
|
142
|
+
|
143
|
+
# Temporary files
|
144
|
+
*.tmp
|
145
|
+
*.temp
|
146
|
+
*.log
|
147
|
+
.temp_*/
|
148
|
+
.cache/
|
149
|
+
|
150
|
+
# DEBase specific
|
151
|
+
enzyme_pipeline*.log
|
152
|
+
temp_merged_input.csv
|
153
|
+
*.egg-info/
|
154
|
+
|
155
|
+
# Project data and examples
|
156
|
+
data/
|
157
|
+
examples/
|
158
|
+
# Keep test.csv as example output
!examples/test.csv
|
159
|
+
|
160
|
+
# Cache files
|
161
|
+
*.pkl
|
162
|
+
*_cache.pkl
|
163
|
+
|
164
|
+
# Large database files
|
165
|
+
*.db
|
166
|
+
|
167
|
+
# PDFs and Excel files
|
168
|
+
*.pdf
|
169
|
+
*.xlsx
|
170
|
+
|
171
|
+
# Backup files
|
172
|
+
*_backup.py
|
173
|
+
lineage_format_backup.py
|
174
|
+
|
175
|
+
# Temporary directories
|
176
|
+
.temp_*
|
177
|
+
enzyme_analysis_*
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Contributing to DEBase
|
2
|
+
|
3
|
+
Thank you for your interest in contributing to DEBase!
|
4
|
+
|
5
|
+
## Development Setup
|
6
|
+
|
7
|
+
1. Clone the repository:
|
8
|
+
```bash
|
9
|
+
git clone https://github.com/yourusername/debase.git
|
10
|
+
cd debase
|
11
|
+
```
|
12
|
+
|
13
|
+
2. Create a virtual environment:
|
14
|
+
```bash
|
15
|
+
python -m venv venv
|
16
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
17
|
+
```
|
18
|
+
|
19
|
+
3. Install in development mode:
|
20
|
+
```bash
|
21
|
+
pip install -e ".[dev]"
|
22
|
+
```
|
23
|
+
|
24
|
+
## Running Tests
|
25
|
+
|
26
|
+
```bash
|
27
|
+
pytest tests/
|
28
|
+
```
|
29
|
+
|
30
|
+
## Code Style
|
31
|
+
|
32
|
+
We use Black for code formatting:
|
33
|
+
```bash
|
34
|
+
black src/ tests/
|
35
|
+
```
|
36
|
+
|
37
|
+
And isort for import sorting:
|
38
|
+
```bash
|
39
|
+
isort src/ tests/
|
40
|
+
```
|
41
|
+
|
42
|
+
## Project Structure
|
43
|
+
|
44
|
+
```
|
45
|
+
debase/
|
46
|
+
├── src/debase/ # Main package source code
|
47
|
+
├── tests/ # Test suite
|
48
|
+
├── docs/ # Documentation
|
49
|
+
├── examples/ # Example outputs and usage
|
50
|
+
├── data/ # Research data (PDFs)
|
51
|
+
└── scripts/ # Utility scripts
|
52
|
+
```
|
53
|
+
|
54
|
+
## Submitting Changes
|
55
|
+
|
56
|
+
1. Fork the repository
|
57
|
+
2. Create a feature branch
|
58
|
+
3. Make your changes
|
59
|
+
4. Add tests if applicable
|
60
|
+
5. Run the test suite
|
61
|
+
6. Submit a pull request
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: debase
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.3
|
4
4
|
Summary: Enzyme lineage analysis and sequence extraction package
|
5
5
|
Home-page: https://github.com/YuemingLong/DEBase
|
6
6
|
Author: DEBase Team
|
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
64
64
|
```bash
|
65
65
|
pip install debase
|
66
66
|
```
|
67
|
-
|
68
|
-
For full functionality with chemical SMILES support:
|
69
|
-
|
70
|
-
```bash
|
71
|
-
pip install debase[rdkit]
|
72
|
-
```
|
73
|
-
|
74
67
|
## Requirements
|
75
68
|
|
76
69
|
- Python 3.8 or higher
|
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
|
|
139
132
|
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
133
|
```
|
141
134
|
|
142
|
-
## Performance Comparison
|
143
|
-
|
144
|
-
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
-
|--------|------------|-----------|----------|----------|
|
146
|
-
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
147
|
-
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
148
|
-
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
135
|
|
150
136
|
## Advanced Usage
|
151
137
|
|
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
|
|
169
155
|
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
156
|
--max-workers 5 --output substrate_scope.csv
|
171
157
|
```
|
172
|
-
|
173
|
-
## Python API
|
174
|
-
|
175
|
-
```python
|
176
|
-
from debase.wrapper import run_pipeline
|
177
|
-
|
178
|
-
# Run full pipeline with parallel processing
|
179
|
-
run_pipeline(
|
180
|
-
manuscript_path="paper.pdf",
|
181
|
-
si_path="si.pdf",
|
182
|
-
output="output.csv",
|
183
|
-
use_parallel_individual=True,
|
184
|
-
max_workers=5
|
185
|
-
)
|
186
|
-
|
187
|
-
# For individual steps
|
188
|
-
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
-
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
-
|
191
|
-
model = setup_gemini_api()
|
192
|
-
reaction_data = extract_reaction_info_parallel(
|
193
|
-
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
-
)
|
195
|
-
```
|
196
|
-
|
197
158
|
## Pipeline Architecture
|
198
159
|
|
199
160
|
The DEBase pipeline consists of 5 main steps:
|
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
222
183
|
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
184
|
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
185
|
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
-
- **Progress tracking:** Real-time status updates
|
226
|
-
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
-
- **Caching:** PDF encoding cache for improved performance
|
228
186
|
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
187
|
|
230
188
|
## Complete Command Reference
|
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
234
192
|
--manuscript PATH # Required: Path to manuscript PDF
|
235
193
|
--si PATH # Optional: Path to supplementary information PDF
|
236
194
|
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
-
--queries N # Number of consensus queries (default: 2)
|
238
195
|
```
|
239
196
|
|
240
197
|
### Performance Options
|
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
|
|
279
236
|
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
237
|
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
238
|
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
-
6.
|
283
|
-
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
-
|
285
|
-
## Troubleshooting
|
286
|
-
|
287
|
-
### No sequences found
|
288
|
-
- The extractor will automatically search PDB and UniProt databases
|
289
|
-
- Check the logs for which database IDs were found and attempted
|
290
|
-
- Sequences with PDB structures will be fetched with high confidence
|
291
|
-
|
292
|
-
### Incorrect enzyme extraction
|
293
|
-
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
-
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
239
|
+
6.
|
295
240
|
|
296
|
-
### PDB matching issues
|
297
|
-
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
-
- Increased context extraction ensures better matching accuracy
|
299
|
-
- Check logs for "Gemini PDB matching" entries to see the matching process
|
@@ -7,13 +7,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
7
7
|
```bash
|
8
8
|
pip install debase
|
9
9
|
```
|
10
|
-
|
11
|
-
For full functionality with chemical SMILES support:
|
12
|
-
|
13
|
-
```bash
|
14
|
-
pip install debase[rdkit]
|
15
|
-
```
|
16
|
-
|
17
10
|
## Requirements
|
18
11
|
|
19
12
|
- Python 3.8 or higher
|
@@ -82,13 +75,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
|
|
82
75
|
debase --manuscript paper.pdf --si si.pdf # Default method
|
83
76
|
```
|
84
77
|
|
85
|
-
## Performance Comparison
|
86
|
-
|
87
|
-
| Method | Total Time | API Calls | Accuracy | Best For |
|
88
|
-
|--------|------------|-----------|----------|----------|
|
89
|
-
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
90
|
-
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
91
|
-
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
92
78
|
|
93
79
|
## Advanced Usage
|
94
80
|
|
@@ -112,31 +98,6 @@ python -m debase.substrate_scope_extractor_parallel \
|
|
112
98
|
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
113
99
|
--max-workers 5 --output substrate_scope.csv
|
114
100
|
```
|
115
|
-
|
116
|
-
## Python API
|
117
|
-
|
118
|
-
```python
|
119
|
-
from debase.wrapper import run_pipeline
|
120
|
-
|
121
|
-
# Run full pipeline with parallel processing
|
122
|
-
run_pipeline(
|
123
|
-
manuscript_path="paper.pdf",
|
124
|
-
si_path="si.pdf",
|
125
|
-
output="output.csv",
|
126
|
-
use_parallel_individual=True,
|
127
|
-
max_workers=5
|
128
|
-
)
|
129
|
-
|
130
|
-
# For individual steps
|
131
|
-
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
132
|
-
from debase.enzyme_lineage_extractor import setup_gemini_api
|
133
|
-
|
134
|
-
model = setup_gemini_api()
|
135
|
-
reaction_data = extract_reaction_info_parallel(
|
136
|
-
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
137
|
-
)
|
138
|
-
```
|
139
|
-
|
140
101
|
## Pipeline Architecture
|
141
102
|
|
142
103
|
The DEBase pipeline consists of 5 main steps:
|
@@ -165,9 +126,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
165
126
|
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
166
127
|
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
167
128
|
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
168
|
-
- **Progress tracking:** Real-time status updates
|
169
|
-
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
170
|
-
- **Caching:** PDF encoding cache for improved performance
|
171
129
|
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
172
130
|
|
173
131
|
## Complete Command Reference
|
@@ -177,7 +135,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
177
135
|
--manuscript PATH # Required: Path to manuscript PDF
|
178
136
|
--si PATH # Optional: Path to supplementary information PDF
|
179
137
|
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
180
|
-
--queries N # Number of consensus queries (default: 2)
|
181
138
|
```
|
182
139
|
|
183
140
|
### Performance Options
|
@@ -222,21 +179,5 @@ The DEBase pipeline consists of 5 main steps:
|
|
222
179
|
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
223
180
|
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
224
181
|
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
225
|
-
6.
|
226
|
-
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
227
|
-
|
228
|
-
## Troubleshooting
|
229
|
-
|
230
|
-
### No sequences found
|
231
|
-
- The extractor will automatically search PDB and UniProt databases
|
232
|
-
- Check the logs for which database IDs were found and attempted
|
233
|
-
- Sequences with PDB structures will be fetched with high confidence
|
234
|
-
|
235
|
-
### Incorrect enzyme extraction
|
236
|
-
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
237
|
-
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
182
|
+
6.
|
238
183
|
|
239
|
-
### PDB matching issues
|
240
|
-
- The system uses AI to match PDB IDs to specific enzyme variants
|
241
|
-
- Increased context extraction ensures better matching accuracy
|
242
|
-
- Check logs for "Gemini PDB matching" entries to see the matching process
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# DEBase Documentation
|
2
|
+
|
3
|
+
This directory contains comprehensive documentation for the DEBase enzyme analysis pipeline.
|
4
|
+
|
5
|
+
## Directory Structure
|
6
|
+
|
7
|
+
- `api/` - API documentation and reference
|
8
|
+
- `examples/` - Usage examples and tutorials
|
9
|
+
- `tutorials/` - Step-by-step guides
|
10
|
+
|
11
|
+
## Quick Start
|
12
|
+
|
13
|
+
See the main [README.md](../README.md) for installation and basic usage.
|
14
|
+
|
15
|
+
## Contents
|
16
|
+
|
17
|
+
1. [Installation Guide](tutorials/installation.md)
|
18
|
+
2. [API Reference](api/README.md)
|
19
|
+
3. [Usage Examples](examples/README.md)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# DEBase Examples
|
2
|
+
|
3
|
+
This directory contains example outputs and usage demonstrations for the DEBase pipeline.
|
4
|
+
|
5
|
+
## Example Outputs
|
6
|
+
|
7
|
+
The `../../examples/` directory contains sample results from successful pipeline runs:
|
8
|
+
|
9
|
+
- `trpb_complete_pipeline.csv` - Complete TrpB enzyme dataset with sequences, mutations, and reactions
|
10
|
+
- `carbene_complete_pipeline.csv` - Carbene transfer enzyme data with SMILES and conditions
|
11
|
+
- `REFINED_ENZYME_SEQUENCES.csv` - Refined sequence extraction results
|
12
|
+
|
13
|
+
## Data Format
|
14
|
+
|
15
|
+
Each CSV contains:
|
16
|
+
- Full-length protein sequences (200-400+ amino acids)
|
17
|
+
- Complete mutation lineage tracking
|
18
|
+
- Chemical reaction SMILES strings
|
19
|
+
- Experimental conditions and metadata
|
20
|
+
- Performance metrics (yield, TTN, enantioselectivity)
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
These files demonstrate the expected output format and can be used as reference for pipeline validation.
|
@@ -0,0 +1,21 @@
|
|
1
|
+
name: debase
|
2
|
+
channels:
|
3
|
+
- conda-forge
|
4
|
+
- defaults
|
5
|
+
dependencies:
|
6
|
+
- python=3.9
|
7
|
+
- pandas>=1.0.0
|
8
|
+
- numpy>=1.19.0
|
9
|
+
- matplotlib>=3.3.0
|
10
|
+
- seaborn>=0.11.0
|
11
|
+
- jupyter>=1.0.0
|
12
|
+
- jupyterlab>=3.0.0
|
13
|
+
- openpyxl>=3.0.0
|
14
|
+
- biopython>=1.78
|
15
|
+
- requests>=2.25.0
|
16
|
+
- tqdm>=4.60.0
|
17
|
+
- rdkit>=2020.03.1
|
18
|
+
- pip
|
19
|
+
- pip:
|
20
|
+
- PyMuPDF>=1.18.0
|
21
|
+
- google-generativeai>=0.3.0
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# DEBase Pipeline Flow
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
|
5
|
+
|
6
|
+
## Pipeline Architecture
|
7
|
+
|
8
|
+
```
|
9
|
+
┌─────────────────────┐ ┌─────────────────────┐
|
10
|
+
│ Manuscript PDF │ │ SI PDF │
|
11
|
+
└──────────┬──────────┘ └──────────┬──────────┘
|
12
|
+
│ │
|
13
|
+
└───────────┬───────────────┘
|
14
|
+
│
|
15
|
+
▼
|
16
|
+
┌─────────────────────────────┐
|
17
|
+
│ 1. enzyme_lineage_extractor │
|
18
|
+
│ - Extract enzyme variants │
|
19
|
+
│ - Parse mutations │
|
20
|
+
│ - Get basic metadata │
|
21
|
+
└─────────────┬───────────────┘
|
22
|
+
│
|
23
|
+
▼
|
24
|
+
┌─────────────────────────────┐
|
25
|
+
│ 2. cleanup_sequence │
|
26
|
+
│ - Validate sequences │
|
27
|
+
│ - Fix formatting issues │
|
28
|
+
│ - Generate full sequences │
|
29
|
+
└─────────────┬───────────────┘
|
30
|
+
│
|
31
|
+
┌───────────┴───────────────┐
|
32
|
+
│ │
|
33
|
+
▼ ▼
|
34
|
+
┌─────────────────────────┐ ┌─────────────────────────┐
|
35
|
+
│ 3a. reaction_info │ │ 3b. substrate_scope │
|
36
|
+
│ _extractor │ │ _extractor │
|
37
|
+
│ - Performance metrics │ │ - Substrate variations │
|
38
|
+
│ - Model reaction │ │ - Additional variants │
|
39
|
+
│ - Conditions │ │ - Scope data │
|
40
|
+
└───────────┬─────────────┘ └───────────┬─────────────┘
|
41
|
+
│ │
|
42
|
+
└───────────┬───────────────┘
|
43
|
+
│
|
44
|
+
▼
|
45
|
+
┌─────────────────────────────┐
|
46
|
+
│ 4. lineage_format │
|
47
|
+
│ - Merge all data │
|
48
|
+
│ - Fill missing sequences │
|
49
|
+
│ - Format final output │
|
50
|
+
└─────────────┬───────────────┘
|
51
|
+
│
|
52
|
+
▼
|
53
|
+
┌─────────────┐
|
54
|
+
│ Final CSV │
|
55
|
+
└─────────────┘
|
56
|
+
```
|
57
|
+
|
58
|
+
## Module Details
|
59
|
+
|
60
|
+
### 1. enzyme_lineage_extractor.py
|
61
|
+
- **Input**: Manuscript PDF, SI PDF
|
62
|
+
- **Output**: CSV with enzyme variants and mutations
|
63
|
+
- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
|
64
|
+
|
65
|
+
### 2. cleanup_sequence.py
|
66
|
+
- **Input**: Enzyme lineage CSV
|
67
|
+
- **Output**: CSV with validated sequences
|
68
|
+
- **Function**: Validates protein sequences, generates full sequences from mutations
|
69
|
+
|
70
|
+
### 3a. reaction_info_extractor.py
|
71
|
+
- **Input**: PDFs + cleaned enzyme CSV
|
72
|
+
- **Output**: CSV with reaction performance data
|
73
|
+
- **Function**: Extracts yield, TTN, selectivity, reaction conditions
|
74
|
+
|
75
|
+
### 3b. substrate_scope_extractor.py
|
76
|
+
- **Input**: PDFs + cleaned enzyme CSV
|
77
|
+
- **Output**: CSV with substrate scope entries
|
78
|
+
- **Function**: Extracts substrate variations tested with different enzymes
|
79
|
+
|
80
|
+
### 4. lineage_format.py
|
81
|
+
- **Input**: Reaction CSV + Substrate scope CSV
|
82
|
+
- **Output**: Final formatted CSV
|
83
|
+
- **Function**: Merges data, fills missing sequences, applies consistent formatting
|
84
|
+
|
85
|
+
## Key Features
|
86
|
+
|
87
|
+
1. **Modular Design**: Each step can be run independently
|
88
|
+
2. **Parallel Extraction**: Steps 3a and 3b run independently
|
89
|
+
3. **Error Recovery**: Pipeline can resume from any step
|
90
|
+
4. **Clean Interfaces**: Each module has well-defined inputs/outputs
|
91
|
+
|
92
|
+
## Usage
|
93
|
+
|
94
|
+
```bash
|
95
|
+
# Full pipeline
|
96
|
+
python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
|
97
|
+
|
98
|
+
# With intermediate files kept for debugging
|
99
|
+
python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
|
100
|
+
```
|
@@ -1297,6 +1297,8 @@ _SEQUENCE_SCHEMA_HINT = """
|
|
1297
1297
|
_SEQ_LOC_PROMPT = """
|
1298
1298
|
Find where FULL-LENGTH protein or DNA sequences are located in this document.
|
1299
1299
|
|
1300
|
+
PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
|
1301
|
+
|
1300
1302
|
Look for table of contents entries or section listings that mention sequences.
|
1301
1303
|
Return a JSON array where each element has:
|
1302
1304
|
- "section": the section heading or description
|
@@ -1305,6 +1307,7 @@ Return a JSON array where each element has:
|
|
1305
1307
|
Focus on:
|
1306
1308
|
- Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
|
1307
1309
|
- Return the EXACT notation as shown.
|
1310
|
+
- Prioritize sections that mention "protein" or "amino acid" sequences
|
1308
1311
|
|
1309
1312
|
Return [] if no sequence sections are found.
|
1310
1313
|
Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
|
@@ -1465,10 +1468,16 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
|
|
1465
1468
|
# --- 7.3 Main extraction prompt ---------------------------------------------
|
1466
1469
|
_SEQ_EXTRACTION_PROMPT = """
|
1467
1470
|
Extract EVERY distinct enzyme-variant sequence you can find in the text.
|
1471
|
+
|
1472
|
+
IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
|
1473
|
+
- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
|
1474
|
+
- Only extract dna_seq if NO amino acid sequence is available for that variant
|
1475
|
+
- This reduces redundancy since protein sequences are usually more relevant
|
1476
|
+
|
1468
1477
|
For each variant return:
|
1469
1478
|
* variant_id - the label used in the paper (e.g. "R4-10")
|
1470
1479
|
* aa_seq - amino-acid sequence (uppercase), or null
|
1471
|
-
* dna_seq - DNA sequence (A/C/G/T), or null
|
1480
|
+
* dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
|
1472
1481
|
|
1473
1482
|
Respond ONLY with **minified JSON** that matches the schema below.
|
1474
1483
|
NO markdown, no code fences, no commentary.
|
@@ -685,7 +685,7 @@ Ignore locations that contain data for other campaigns.
|
|
685
685
|
'confidence': 95
|
686
686
|
}
|
687
687
|
|
688
|
-
def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
|
688
|
+
def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
689
689
|
"""Find the model reaction for a specific lineage group."""
|
690
690
|
# Gather relevant text near this location
|
691
691
|
page_text = self._page_with_reference(location) or ""
|
@@ -693,6 +693,7 @@ Ignore locations that contain data for other campaigns.
|
|
693
693
|
# Also check manuscript introduction for model reaction info
|
694
694
|
intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
|
695
695
|
|
696
|
+
# Build the prompt with location and context
|
696
697
|
prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
|
697
698
|
location=location,
|
698
699
|
group_context=group_context
|
@@ -700,6 +701,22 @@ Ignore locations that contain data for other campaigns.
|
|
700
701
|
prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
|
701
702
|
prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
|
702
703
|
|
704
|
+
# If we have model reaction locations, include text from those locations too
|
705
|
+
if model_reaction_locations:
|
706
|
+
# Add text from model reaction location
|
707
|
+
if model_reaction_locations.get("model_reaction_location", {}).get("location"):
|
708
|
+
model_loc = model_reaction_locations["model_reaction_location"]["location"]
|
709
|
+
model_text = self._get_text_around_location(model_loc)
|
710
|
+
if model_text:
|
711
|
+
prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
|
712
|
+
|
713
|
+
# Add text from conditions location (often contains reaction details)
|
714
|
+
if model_reaction_locations.get("conditions_location", {}).get("location"):
|
715
|
+
cond_loc = model_reaction_locations["conditions_location"]["location"]
|
716
|
+
cond_text = self._get_text_around_location(cond_loc)
|
717
|
+
if cond_text:
|
718
|
+
prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
|
719
|
+
|
703
720
|
try:
|
704
721
|
data = generate_json_with_retry(
|
705
722
|
self.model,
|
@@ -1038,7 +1055,20 @@ Different campaigns may use different model reactions.
|
|
1038
1055
|
"""Extract text around a given location identifier."""
|
1039
1056
|
location_lower = location.lower()
|
1040
1057
|
|
1041
|
-
#
|
1058
|
+
# Handle compound locations like "Figure 2 caption and Section I"
|
1059
|
+
# Extract the first figure/table/scheme reference
|
1060
|
+
figure_match = re.search(r"(figure|scheme|table)\s*\d+", location_lower)
|
1061
|
+
if figure_match:
|
1062
|
+
primary_location = figure_match.group(0)
|
1063
|
+
# Try to find this primary location first
|
1064
|
+
for page_text in self.all_pages:
|
1065
|
+
if primary_location in page_text.lower():
|
1066
|
+
idx = page_text.lower().index(primary_location)
|
1067
|
+
start = max(0, idx - 500)
|
1068
|
+
end = min(len(page_text), idx + 3000)
|
1069
|
+
return page_text[start:end]
|
1070
|
+
|
1071
|
+
# Search in all pages for exact match
|
1042
1072
|
for page_text in self.all_pages:
|
1043
1073
|
if location_lower in page_text.lower():
|
1044
1074
|
# Find the location and extract context around it
|
@@ -1790,8 +1820,16 @@ TEXT FROM MANUSCRIPT:
|
|
1790
1820
|
if location.get('caption'):
|
1791
1821
|
location_context += f"\nCaption: {location['caption']}"
|
1792
1822
|
|
1793
|
-
#
|
1794
|
-
|
1823
|
+
# First find model reaction locations for this campaign/enzyme group
|
1824
|
+
location_enzymes = df_location['enzyme'].unique().tolist()
|
1825
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1826
|
+
|
1827
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1828
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1829
|
+
location['location'],
|
1830
|
+
location_context,
|
1831
|
+
model_reaction_locations
|
1832
|
+
)
|
1795
1833
|
|
1796
1834
|
# Get full model reaction info with IUPAC names
|
1797
1835
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1799,7 +1837,6 @@ TEXT FROM MANUSCRIPT:
|
|
1799
1837
|
else:
|
1800
1838
|
# Fall back to general model reaction extraction
|
1801
1839
|
# Pass the enzyme variants from this location
|
1802
|
-
location_enzymes = df_location['enzyme'].unique().tolist()
|
1803
1840
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1804
1841
|
|
1805
1842
|
# Add model reaction info to all enzymes from this location
|
@@ -1891,7 +1928,16 @@ TEXT FROM MANUSCRIPT:
|
|
1891
1928
|
if group.get('caption'):
|
1892
1929
|
location_context += f"\nCaption: {group['caption']}"
|
1893
1930
|
|
1894
|
-
|
1931
|
+
# First find model reaction locations for this enzyme group
|
1932
|
+
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1933
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1934
|
+
|
1935
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1936
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1937
|
+
group_location,
|
1938
|
+
location_context,
|
1939
|
+
model_reaction_locations
|
1940
|
+
)
|
1895
1941
|
|
1896
1942
|
# Get full model reaction info with IUPAC names
|
1897
1943
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1899,7 +1945,6 @@ TEXT FROM MANUSCRIPT:
|
|
1899
1945
|
else:
|
1900
1946
|
# Try to extract model reaction from this specific location
|
1901
1947
|
# Pass the enzyme variants that have data in this location
|
1902
|
-
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1903
1948
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1904
1949
|
|
1905
1950
|
# Add model reaction info to all enzymes from this location
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: debase
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.3
|
4
4
|
Summary: Enzyme lineage analysis and sequence extraction package
|
5
5
|
Home-page: https://github.com/YuemingLong/DEBase
|
6
6
|
Author: DEBase Team
|
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
64
64
|
```bash
|
65
65
|
pip install debase
|
66
66
|
```
|
67
|
-
|
68
|
-
For full functionality with chemical SMILES support:
|
69
|
-
|
70
|
-
```bash
|
71
|
-
pip install debase[rdkit]
|
72
|
-
```
|
73
|
-
|
74
67
|
## Requirements
|
75
68
|
|
76
69
|
- Python 3.8 or higher
|
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
|
|
139
132
|
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
133
|
```
|
141
134
|
|
142
|
-
## Performance Comparison
|
143
|
-
|
144
|
-
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
-
|--------|------------|-----------|----------|----------|
|
146
|
-
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
147
|
-
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
148
|
-
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
135
|
|
150
136
|
## Advanced Usage
|
151
137
|
|
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
|
|
169
155
|
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
156
|
--max-workers 5 --output substrate_scope.csv
|
171
157
|
```
|
172
|
-
|
173
|
-
## Python API
|
174
|
-
|
175
|
-
```python
|
176
|
-
from debase.wrapper import run_pipeline
|
177
|
-
|
178
|
-
# Run full pipeline with parallel processing
|
179
|
-
run_pipeline(
|
180
|
-
manuscript_path="paper.pdf",
|
181
|
-
si_path="si.pdf",
|
182
|
-
output="output.csv",
|
183
|
-
use_parallel_individual=True,
|
184
|
-
max_workers=5
|
185
|
-
)
|
186
|
-
|
187
|
-
# For individual steps
|
188
|
-
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
-
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
-
|
191
|
-
model = setup_gemini_api()
|
192
|
-
reaction_data = extract_reaction_info_parallel(
|
193
|
-
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
-
)
|
195
|
-
```
|
196
|
-
|
197
158
|
## Pipeline Architecture
|
198
159
|
|
199
160
|
The DEBase pipeline consists of 5 main steps:
|
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
222
183
|
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
184
|
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
185
|
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
-
- **Progress tracking:** Real-time status updates
|
226
|
-
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
-
- **Caching:** PDF encoding cache for improved performance
|
228
186
|
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
187
|
|
230
188
|
## Complete Command Reference
|
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
234
192
|
--manuscript PATH # Required: Path to manuscript PDF
|
235
193
|
--si PATH # Optional: Path to supplementary information PDF
|
236
194
|
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
-
--queries N # Number of consensus queries (default: 2)
|
238
195
|
```
|
239
196
|
|
240
197
|
### Performance Options
|
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
|
|
279
236
|
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
237
|
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
238
|
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
-
6.
|
283
|
-
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
-
|
285
|
-
## Troubleshooting
|
286
|
-
|
287
|
-
### No sequences found
|
288
|
-
- The extractor will automatically search PDB and UniProt databases
|
289
|
-
- Check the logs for which database IDs were found and attempted
|
290
|
-
- Sequences with PDB structures will be fetched with high confidence
|
291
|
-
|
292
|
-
### Incorrect enzyme extraction
|
293
|
-
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
-
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
239
|
+
6.
|
295
240
|
|
296
|
-
### PDB matching issues
|
297
|
-
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
-
- Increased context extraction ensures better matching accuracy
|
299
|
-
- Check logs for "Gemini PDB matching" entries to see the matching process
|
@@ -1,8 +1,15 @@
|
|
1
|
+
.gitignore
|
2
|
+
CONTRIBUTING.md
|
1
3
|
LICENSE
|
2
4
|
MANIFEST.in
|
3
5
|
README.md
|
6
|
+
environment.yml
|
4
7
|
pyproject.toml
|
5
8
|
setup.py
|
9
|
+
docs/README.md
|
10
|
+
docs/examples/README.md
|
11
|
+
src/__init__.py
|
12
|
+
src/debase/PIPELINE_FLOW.md
|
6
13
|
src/debase/__init__.py
|
7
14
|
src/debase/__main__.py
|
8
15
|
src/debase/_version.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|