debase 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. debase-0.1.3/.gitignore +177 -0
  2. debase-0.1.3/CONTRIBUTING.md +61 -0
  3. {debase-0.1.1/src/debase.egg-info → debase-0.1.3}/PKG-INFO +2 -61
  4. {debase-0.1.1 → debase-0.1.3}/README.md +1 -60
  5. debase-0.1.3/docs/README.md +19 -0
  6. debase-0.1.3/docs/examples/README.md +24 -0
  7. debase-0.1.3/environment.yml +21 -0
  8. debase-0.1.3/src/debase/PIPELINE_FLOW.md +100 -0
  9. {debase-0.1.1 → debase-0.1.3}/src/debase/_version.py +1 -1
  10. {debase-0.1.1 → debase-0.1.3}/src/debase/enzyme_lineage_extractor.py +10 -1
  11. {debase-0.1.1 → debase-0.1.3}/src/debase/reaction_info_extractor.py +52 -7
  12. {debase-0.1.1 → debase-0.1.3/src/debase.egg-info}/PKG-INFO +2 -61
  13. {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/SOURCES.txt +7 -0
  14. debase-0.1.3/src/debase.egg-info/dependency_links.txt +1 -0
  15. {debase-0.1.1 → debase-0.1.3}/LICENSE +0 -0
  16. {debase-0.1.1 → debase-0.1.3}/MANIFEST.in +0 -0
  17. {debase-0.1.1 → debase-0.1.3}/pyproject.toml +0 -0
  18. {debase-0.1.1 → debase-0.1.3}/setup.cfg +0 -0
  19. {debase-0.1.1 → debase-0.1.3}/setup.py +0 -0
  20. /debase-0.1.1/src/debase.egg-info/dependency_links.txt → /debase-0.1.3/src/__init__.py +0 -0
  21. {debase-0.1.1 → debase-0.1.3}/src/debase/__init__.py +0 -0
  22. {debase-0.1.1 → debase-0.1.3}/src/debase/__main__.py +0 -0
  23. {debase-0.1.1 → debase-0.1.3}/src/debase/build_db.py +0 -0
  24. {debase-0.1.1 → debase-0.1.3}/src/debase/cleanup_sequence.py +0 -0
  25. {debase-0.1.1 → debase-0.1.3}/src/debase/lineage_format.py +0 -0
  26. {debase-0.1.1 → debase-0.1.3}/src/debase/substrate_scope_extractor.py +0 -0
  27. {debase-0.1.1 → debase-0.1.3}/src/debase/wrapper.py +0 -0
  28. {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/entry_points.txt +0 -0
  29. {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/requires.txt +0 -0
  30. {debase-0.1.1 → debase-0.1.3}/src/debase.egg-info/top_level.txt +0 -0
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Jupyter Notebook
53
+ .ipynb_checkpoints
54
+
55
+ # IPython
56
+ profile_default/
57
+ ipython_config.py
58
+
59
+ # pyenv
60
+ .python-version
61
+
62
+ # pipenv
63
+ Pipfile.lock
64
+
65
+ # poetry
66
+ poetry.lock
67
+
68
+ # pdm
69
+ .pdm.toml
70
+
71
+ # PEP 582
72
+ __pypackages__/
73
+
74
+ # Celery stuff
75
+ celerybeat-schedule
76
+ celerybeat.pid
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # pytype static type analyzer
109
+ .pytype/
110
+
111
+ # Cython debug symbols
112
+ cython_debug/
113
+
114
+ # PyCharm
115
+ .idea/
116
+
117
+ # VS Code
118
+ .vscode/
119
+
120
+ # macOS
121
+ .DS_Store
122
+ .AppleDouble
123
+ .LSOverride
124
+
125
+ # Windows
126
+ Thumbs.db
127
+ Thumbs.db:encryptable
128
+ ehthumbs.db
129
+ ehthumbs_vista.db
130
+ *.stackdump
131
+ [Dd]esktop.ini
132
+ $RECYCLE.BIN/
133
+ *.cab
134
+ *.msi
135
+ *.msix
136
+ *.msm
137
+ *.msp
138
+ *.lnk
139
+
140
+ # Linux
141
+ *~
142
+
143
+ # Temporary files
144
+ *.tmp
145
+ *.temp
146
+ *.log
147
+ .temp_*/
148
+ .cache/
149
+
150
+ # DEBase specific
151
+ enzyme_pipeline*.log
152
+ temp_merged_input.csv
153
+ *.egg-info/
154
+
155
+ # Project data and examples
156
+ data/
157
+ examples/
158
+ # Keep test.csv as example output (note: gitignore has no inline comments,
+ # so the comment must be on its own line; also, re-including a file under an
+ # ignored directory like examples/ requires un-ignoring the directory first)
+ !examples/test.csv
159
+
160
+ # Cache files
161
+ *.pkl
162
+ *_cache.pkl
163
+
164
+ # Large database files
165
+ *.db
166
+
167
+ # PDFs and Excel files
168
+ *.pdf
169
+ *.xlsx
170
+
171
+ # Backup files
172
+ *_backup.py
173
+ lineage_format_backup.py
174
+
175
+ # Temporary directories
176
+ .temp_*
177
+ enzyme_analysis_*
@@ -0,0 +1,61 @@
1
+ # Contributing to DEBase
2
+
3
+ Thank you for your interest in contributing to DEBase!
4
+
5
+ ## Development Setup
6
+
7
+ 1. Clone the repository:
8
+ ```bash
9
+ git clone https://github.com/yourusername/debase.git
10
+ cd debase
11
+ ```
12
+
13
+ 2. Create a virtual environment:
14
+ ```bash
15
+ python -m venv venv
16
+ source venv/bin/activate # On Windows: venv\Scripts\activate
17
+ ```
18
+
19
+ 3. Install in development mode:
20
+ ```bash
21
+ pip install -e ".[dev]"
22
+ ```
23
+
24
+ ## Running Tests
25
+
26
+ ```bash
27
+ pytest tests/
28
+ ```
29
+
30
+ ## Code Style
31
+
32
+ We use Black for code formatting:
33
+ ```bash
34
+ black src/ tests/
35
+ ```
36
+
37
+ And isort for import sorting:
38
+ ```bash
39
+ isort src/ tests/
40
+ ```
41
+
42
+ ## Project Structure
43
+
44
+ ```
45
+ debase/
46
+ ├── src/debase/ # Main package source code
47
+ ├── tests/ # Test suite
48
+ ├── docs/ # Documentation
49
+ ├── examples/ # Example outputs and usage
50
+ ├── data/ # Research data (PDFs)
51
+ └── scripts/ # Utility scripts
52
+ ```
53
+
54
+ ## Submitting Changes
55
+
56
+ 1. Fork the repository
57
+ 2. Create a feature branch
58
+ 3. Make your changes
59
+ 4. Add tests if applicable
60
+ 5. Run the test suite
61
+ 6. Submit a pull request
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
64
64
  ```bash
65
65
  pip install debase
66
66
  ```
67
-
68
- For full functionality with chemical SMILES support:
69
-
70
- ```bash
71
- pip install debase[rdkit]
72
- ```
73
-
74
67
  ## Requirements
75
68
 
76
69
  - Python 3.8 or higher
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
139
132
  debase --manuscript paper.pdf --si si.pdf # Default method
140
133
  ```
141
134
 
142
- ## Performance Comparison
143
-
144
- | Method | Total Time | API Calls | Accuracy | Best For |
145
- |--------|------------|-----------|----------|----------|
146
- | Sequential | ~45 min | 44 calls | Highest | Small datasets |
147
- | **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
148
- | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
149
135
 
150
136
  ## Advanced Usage
151
137
 
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
169
155
  --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
170
156
  --max-workers 5 --output substrate_scope.csv
171
157
  ```
172
-
173
- ## Python API
174
-
175
- ```python
176
- from debase.wrapper import run_pipeline
177
-
178
- # Run full pipeline with parallel processing
179
- run_pipeline(
180
- manuscript_path="paper.pdf",
181
- si_path="si.pdf",
182
- output="output.csv",
183
- use_parallel_individual=True,
184
- max_workers=5
185
- )
186
-
187
- # For individual steps
188
- from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
189
- from debase.enzyme_lineage_extractor import setup_gemini_api
190
-
191
- model = setup_gemini_api()
192
- reaction_data = extract_reaction_info_parallel(
193
- model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
194
- )
195
- ```
196
-
197
158
  ## Pipeline Architecture
198
159
 
199
160
  The DEBase pipeline consists of 5 main steps:
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
222
183
  - **External database integration:** Automatic sequence fetching from PDB and UniProt
223
184
  - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
224
185
  - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
225
- - **Progress tracking:** Real-time status updates
226
- - **Flexible output:** CSV format with comprehensive chemical and performance data
227
- - **Caching:** PDF encoding cache for improved performance
228
186
  - **Vision capabilities:** Extracts data from both text and images in PDFs
229
187
 
230
188
  ## Complete Command Reference
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
234
192
  --manuscript PATH # Required: Path to manuscript PDF
235
193
  --si PATH # Optional: Path to supplementary information PDF
236
194
  --output PATH # Output file path (default: manuscript_name_debase.csv)
237
- --queries N # Number of consensus queries (default: 2)
238
195
  ```
239
196
 
240
197
  ### Performance Options
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
279
236
  3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
280
237
  4. **Skip validation** (`--skip-validation`) for faster processing in production
281
238
  5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
282
- 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
283
- 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
284
-
285
- ## Troubleshooting
286
-
287
- ### No sequences found
288
- - The extractor will automatically search PDB and UniProt databases
289
- - Check the logs for which database IDs were found and attempted
290
- - Sequences with PDB structures will be fetched with high confidence
291
-
292
- ### Incorrect enzyme extraction
293
- - Non-enzyme entries (buffers, controls, media) are automatically filtered
294
- - Check the log for entries marked as "Filtering out non-enzyme entry"
239
+ 6.
295
240
 
296
- ### PDB matching issues
297
- - The system uses AI to match PDB IDs to specific enzyme variants
298
- - Increased context extraction ensures better matching accuracy
299
- - Check logs for "Gemini PDB matching" entries to see the matching process
@@ -7,13 +7,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
7
7
  ```bash
8
8
  pip install debase
9
9
  ```
10
-
11
- For full functionality with chemical SMILES support:
12
-
13
- ```bash
14
- pip install debase[rdkit]
15
- ```
16
-
17
10
  ## Requirements
18
11
 
19
12
  - Python 3.8 or higher
@@ -82,13 +75,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
82
75
  debase --manuscript paper.pdf --si si.pdf # Default method
83
76
  ```
84
77
 
85
- ## Performance Comparison
86
-
87
- | Method | Total Time | API Calls | Accuracy | Best For |
88
- |--------|------------|-----------|----------|----------|
89
- | Sequential | ~45 min | 44 calls | Highest | Small datasets |
90
- | **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
91
- | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
92
78
 
93
79
  ## Advanced Usage
94
80
 
@@ -112,31 +98,6 @@ python -m debase.substrate_scope_extractor_parallel \
112
98
  --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
113
99
  --max-workers 5 --output substrate_scope.csv
114
100
  ```
115
-
116
- ## Python API
117
-
118
- ```python
119
- from debase.wrapper import run_pipeline
120
-
121
- # Run full pipeline with parallel processing
122
- run_pipeline(
123
- manuscript_path="paper.pdf",
124
- si_path="si.pdf",
125
- output="output.csv",
126
- use_parallel_individual=True,
127
- max_workers=5
128
- )
129
-
130
- # For individual steps
131
- from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
132
- from debase.enzyme_lineage_extractor import setup_gemini_api
133
-
134
- model = setup_gemini_api()
135
- reaction_data = extract_reaction_info_parallel(
136
- model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
137
- )
138
- ```
139
-
140
101
  ## Pipeline Architecture
141
102
 
142
103
  The DEBase pipeline consists of 5 main steps:
@@ -165,9 +126,6 @@ The DEBase pipeline consists of 5 main steps:
165
126
  - **External database integration:** Automatic sequence fetching from PDB and UniProt
166
127
  - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
167
128
  - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
168
- - **Progress tracking:** Real-time status updates
169
- - **Flexible output:** CSV format with comprehensive chemical and performance data
170
- - **Caching:** PDF encoding cache for improved performance
171
129
  - **Vision capabilities:** Extracts data from both text and images in PDFs
172
130
 
173
131
  ## Complete Command Reference
@@ -177,7 +135,6 @@ The DEBase pipeline consists of 5 main steps:
177
135
  --manuscript PATH # Required: Path to manuscript PDF
178
136
  --si PATH # Optional: Path to supplementary information PDF
179
137
  --output PATH # Output file path (default: manuscript_name_debase.csv)
180
- --queries N # Number of consensus queries (default: 2)
181
138
  ```
182
139
 
183
140
  ### Performance Options
@@ -222,21 +179,5 @@ The DEBase pipeline consists of 5 main steps:
222
179
  3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
223
180
  4. **Skip validation** (`--skip-validation`) for faster processing in production
224
181
  5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
225
- 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
226
- 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
227
-
228
- ## Troubleshooting
229
-
230
- ### No sequences found
231
- - The extractor will automatically search PDB and UniProt databases
232
- - Check the logs for which database IDs were found and attempted
233
- - Sequences with PDB structures will be fetched with high confidence
234
-
235
- ### Incorrect enzyme extraction
236
- - Non-enzyme entries (buffers, controls, media) are automatically filtered
237
- - Check the log for entries marked as "Filtering out non-enzyme entry"
182
+ 6.
238
183
 
239
- ### PDB matching issues
240
- - The system uses AI to match PDB IDs to specific enzyme variants
241
- - Increased context extraction ensures better matching accuracy
242
- - Check logs for "Gemini PDB matching" entries to see the matching process
@@ -0,0 +1,19 @@
1
+ # DEBase Documentation
2
+
3
+ This directory contains comprehensive documentation for the DEBase enzyme analysis pipeline.
4
+
5
+ ## Directory Structure
6
+
7
+ - `api/` - API documentation and reference
8
+ - `examples/` - Usage examples and tutorials
9
+ - `tutorials/` - Step-by-step guides
10
+
11
+ ## Quick Start
12
+
13
+ See the main [README.md](../README.md) for installation and basic usage.
14
+
15
+ ## Contents
16
+
17
+ 1. [Installation Guide](tutorials/installation.md)
18
+ 2. [API Reference](api/README.md)
19
+ 3. [Usage Examples](examples/README.md)
@@ -0,0 +1,24 @@
1
+ # DEBase Examples
2
+
3
+ This directory contains example outputs and usage demonstrations for the DEBase pipeline.
4
+
5
+ ## Example Outputs
6
+
7
+ The `../../examples/` directory contains sample results from successful pipeline runs:
8
+
9
+ - `trpb_complete_pipeline.csv` - Complete TrpB enzyme dataset with sequences, mutations, and reactions
10
+ - `carbene_complete_pipeline.csv` - Carbene transfer enzyme data with SMILES and conditions
11
+ - `REFINED_ENZYME_SEQUENCES.csv` - Refined sequence extraction results
12
+
13
+ ## Data Format
14
+
15
+ Each CSV contains:
16
+ - Full-length protein sequences (200-400+ amino acids)
17
+ - Complete mutation lineage tracking
18
+ - Chemical reaction SMILES strings
19
+ - Experimental conditions and metadata
20
+ - Performance metrics (yield, TTN, enantioselectivity)
21
+
22
+ ## Usage
23
+
24
+ These files demonstrate the expected output format and can be used as reference for pipeline validation.
@@ -0,0 +1,21 @@
1
+ name: debase
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python=3.9
7
+ - pandas>=1.0.0
8
+ - numpy>=1.19.0
9
+ - matplotlib>=3.3.0
10
+ - seaborn>=0.11.0
11
+ - jupyter>=1.0.0
12
+ - jupyterlab>=3.0.0
13
+ - openpyxl>=3.0.0
14
+ - biopython>=1.78
15
+ - requests>=2.25.0
16
+ - tqdm>=4.60.0
17
+ - rdkit>=2020.03.1
18
+ - pip
19
+ - pip:
20
+ - PyMuPDF>=1.18.0
21
+ - google-generativeai>=0.3.0
@@ -0,0 +1,100 @@
1
+ # DEBase Pipeline Flow
2
+
3
+ ## Overview
4
+ The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
5
+
6
+ ## Pipeline Architecture
7
+
8
+ ```
9
+ ┌─────────────────────┐ ┌─────────────────────┐
10
+ │ Manuscript PDF │ │ SI PDF │
11
+ └──────────┬──────────┘ └──────────┬──────────┘
12
+ │ │
13
+ └───────────┬───────────────┘
14
+
15
+
16
+ ┌─────────────────────────────┐
17
+ │ 1. enzyme_lineage_extractor │
18
+ │ - Extract enzyme variants │
19
+ │ - Parse mutations │
20
+ │ - Get basic metadata │
21
+ └─────────────┬───────────────┘
22
+
23
+
24
+ ┌─────────────────────────────┐
25
+ │ 2. cleanup_sequence │
26
+ │ - Validate sequences │
27
+ │ - Fix formatting issues │
28
+ │ - Generate full sequences │
29
+ └─────────────┬───────────────┘
30
+
31
+ ┌───────────┴───────────────┐
32
+ │ │
33
+ ▼ ▼
34
+ ┌─────────────────────────┐ ┌─────────────────────────┐
35
+ │ 3a. reaction_info │ │ 3b. substrate_scope │
36
+ │ _extractor │ │ _extractor │
37
+ │ - Performance metrics │ │ - Substrate variations │
38
+ │ - Model reaction │ │ - Additional variants │
39
+ │ - Conditions │ │ - Scope data │
40
+ └───────────┬─────────────┘ └───────────┬─────────────┘
41
+ │ │
42
+ └───────────┬───────────────┘
43
+
44
+
45
+ ┌─────────────────────────────┐
46
+ │ 4. lineage_format_o3 │
47
+ │ - Merge all data │
48
+ │ - Fill missing sequences │
49
+ │ - Format final output │
50
+ └─────────────┬───────────────┘
51
+
52
+
53
+ ┌─────────────┐
54
+ │ Final CSV │
55
+ └─────────────┘
56
+ ```
57
+
58
+ ## Module Details
59
+
60
+ ### 1. enzyme_lineage_extractor.py
61
+ - **Input**: Manuscript PDF, SI PDF
62
+ - **Output**: CSV with enzyme variants and mutations
63
+ - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
64
+
65
+ ### 2. cleanup_sequence.py
66
+ - **Input**: Enzyme lineage CSV
67
+ - **Output**: CSV with validated sequences
68
+ - **Function**: Validates protein sequences, generates full sequences from mutations
69
+
70
+ ### 3a. reaction_info_extractor.py
71
+ - **Input**: PDFs + cleaned enzyme CSV
72
+ - **Output**: CSV with reaction performance data
73
+ - **Function**: Extracts yield, TTN, selectivity, reaction conditions
74
+
75
+ ### 3b. substrate_scope_extractor.py
76
+ - **Input**: PDFs + cleaned enzyme CSV
77
+ - **Output**: CSV with substrate scope entries
78
+ - **Function**: Extracts substrate variations tested with different enzymes
79
+
80
+ ### 4. lineage_format_o3.py
81
+ - **Input**: Reaction CSV + Substrate scope CSV
82
+ - **Output**: Final formatted CSV
83
+ - **Function**: Merges data, fills missing sequences, applies consistent formatting
84
+
85
+ ## Key Features
86
+
87
+ 1. **Modular Design**: Each step can be run independently
88
+ 2. **Parallel Extraction**: Steps 3a and 3b run independently
89
+ 3. **Error Recovery**: Pipeline can resume from any step
90
+ 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
91
+
92
+ ## Usage
93
+
94
+ ```bash
95
+ # Full pipeline
96
+ python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
97
+
98
+ # With intermediate files kept for debugging
99
+ python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
100
+ ```
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.1"
3
+ __version__ = "0.1.3"
@@ -1297,6 +1297,8 @@ _SEQUENCE_SCHEMA_HINT = """
1297
1297
  _SEQ_LOC_PROMPT = """
1298
1298
  Find where FULL-LENGTH protein or DNA sequences are located in this document.
1299
1299
 
1300
+ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1301
+
1300
1302
  Look for table of contents entries or section listings that mention sequences.
1301
1303
  Return a JSON array where each element has:
1302
1304
  - "section": the section heading or description
@@ -1305,6 +1307,7 @@ Return a JSON array where each element has:
1305
1307
  Focus on:
1306
1308
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1307
1309
  - Return the EXACT notation as shown.
1310
+ - Prioritize sections that mention "protein" or "amino acid" sequences
1308
1311
 
1309
1312
  Return [] if no sequence sections are found.
1310
1313
  Absolutely don't include nucleotides or primer sequences; it is better to return nothing than an incomplete sequence — use your best judgement.
@@ -1465,10 +1468,16 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
1465
1468
  # --- 7.3 Main extraction prompt ---------------------------------------------
1466
1469
  _SEQ_EXTRACTION_PROMPT = """
1467
1470
  Extract EVERY distinct enzyme-variant sequence you can find in the text.
1471
+
1472
+ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
1473
+ - If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
1474
+ - Only extract dna_seq if NO amino acid sequence is available for that variant
1475
+ - This reduces redundancy since protein sequences are usually more relevant
1476
+
1468
1477
  For each variant return:
1469
1478
  * variant_id - the label used in the paper (e.g. "R4-10")
1470
1479
  * aa_seq - amino-acid sequence (uppercase), or null
1471
- * dna_seq - DNA sequence (A/C/G/T), or null
1480
+ * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
1472
1481
 
1473
1482
  Respond ONLY with **minified JSON** that matches the schema below.
1474
1483
  NO markdown, no code fences, no commentary.
@@ -685,7 +685,7 @@ Ignore locations that contain data for other campaigns.
685
685
  'confidence': 95
686
686
  }
687
687
 
688
- def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
688
+ def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
689
689
  """Find the model reaction for a specific lineage group."""
690
690
  # Gather relevant text near this location
691
691
  page_text = self._page_with_reference(location) or ""
@@ -693,6 +693,7 @@ Ignore locations that contain data for other campaigns.
693
693
  # Also check manuscript introduction for model reaction info
694
694
  intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
695
695
 
696
+ # Build the prompt with location and context
696
697
  prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
697
698
  location=location,
698
699
  group_context=group_context
@@ -700,6 +701,22 @@ Ignore locations that contain data for other campaigns.
700
701
  prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
701
702
  prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
702
703
 
704
+ # If we have model reaction locations, include text from those locations too
705
+ if model_reaction_locations:
706
+ # Add text from model reaction location
707
+ if model_reaction_locations.get("model_reaction_location", {}).get("location"):
708
+ model_loc = model_reaction_locations["model_reaction_location"]["location"]
709
+ model_text = self._get_text_around_location(model_loc)
710
+ if model_text:
711
+ prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
712
+
713
+ # Add text from conditions location (often contains reaction details)
714
+ if model_reaction_locations.get("conditions_location", {}).get("location"):
715
+ cond_loc = model_reaction_locations["conditions_location"]["location"]
716
+ cond_text = self._get_text_around_location(cond_loc)
717
+ if cond_text:
718
+ prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
719
+
703
720
  try:
704
721
  data = generate_json_with_retry(
705
722
  self.model,
@@ -1038,7 +1055,20 @@ Different campaigns may use different model reactions.
1038
1055
  """Extract text around a given location identifier."""
1039
1056
  location_lower = location.lower()
1040
1057
 
1041
- # Search in all pages
1058
+ # Handle compound locations like "Figure 2 caption and Section I"
1059
+ # Extract the first figure/table/scheme reference
1060
+ figure_match = re.search(r"(figure|scheme|table)\s*\d+", location_lower)
1061
+ if figure_match:
1062
+ primary_location = figure_match.group(0)
1063
+ # Try to find this primary location first
1064
+ for page_text in self.all_pages:
1065
+ if primary_location in page_text.lower():
1066
+ idx = page_text.lower().index(primary_location)
1067
+ start = max(0, idx - 500)
1068
+ end = min(len(page_text), idx + 3000)
1069
+ return page_text[start:end]
1070
+
1071
+ # Search in all pages for exact match
1042
1072
  for page_text in self.all_pages:
1043
1073
  if location_lower in page_text.lower():
1044
1074
  # Find the location and extract context around it
@@ -1790,8 +1820,16 @@ TEXT FROM MANUSCRIPT:
1790
1820
  if location.get('caption'):
1791
1821
  location_context += f"\nCaption: {location['caption']}"
1792
1822
 
1793
- # Try to find model reaction for this specific lineage
1794
- location_model_reaction = self.find_lineage_model_reaction(location['location'], location_context)
1823
+ # First find model reaction locations for this campaign/enzyme group
1824
+ location_enzymes = df_location['enzyme'].unique().tolist()
1825
+ model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1826
+
1827
+ # Try to find model reaction for this specific lineage, passing the locations
1828
+ location_model_reaction = self.find_lineage_model_reaction(
1829
+ location['location'],
1830
+ location_context,
1831
+ model_reaction_locations
1832
+ )
1795
1833
 
1796
1834
  # Get full model reaction info with IUPAC names
1797
1835
  if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
@@ -1799,7 +1837,6 @@ TEXT FROM MANUSCRIPT:
1799
1837
  else:
1800
1838
  # Fall back to general model reaction extraction
1801
1839
  # Pass the enzyme variants from this location
1802
- location_enzymes = df_location['enzyme'].unique().tolist()
1803
1840
  model_info = self.gather_model_reaction_info(location_enzymes)
1804
1841
 
1805
1842
  # Add model reaction info to all enzymes from this location
@@ -1891,7 +1928,16 @@ TEXT FROM MANUSCRIPT:
1891
1928
  if group.get('caption'):
1892
1929
  location_context += f"\nCaption: {group['caption']}"
1893
1930
 
1894
- location_model_reaction = self.find_lineage_model_reaction(group_location, location_context)
1931
+ # First find model reaction locations for this enzyme group
1932
+ location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1933
+ model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1934
+
1935
+ # Try to find model reaction for this specific lineage, passing the locations
1936
+ location_model_reaction = self.find_lineage_model_reaction(
1937
+ group_location,
1938
+ location_context,
1939
+ model_reaction_locations
1940
+ )
1895
1941
 
1896
1942
  # Get full model reaction info with IUPAC names
1897
1943
  if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
@@ -1899,7 +1945,6 @@ TEXT FROM MANUSCRIPT:
1899
1945
  else:
1900
1946
  # Try to extract model reaction from this specific location
1901
1947
  # Pass the enzyme variants that have data in this location
1902
- location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1903
1948
  model_info = self.gather_model_reaction_info(location_enzymes)
1904
1949
 
1905
1950
  # Add model reaction info to all enzymes from this location
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
64
64
  ```bash
65
65
  pip install debase
66
66
  ```
67
-
68
- For full functionality with chemical SMILES support:
69
-
70
- ```bash
71
- pip install debase[rdkit]
72
- ```
73
-
74
67
  ## Requirements
75
68
 
76
69
  - Python 3.8 or higher
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
139
132
  debase --manuscript paper.pdf --si si.pdf # Default method
140
133
  ```
141
134
 
142
- ## Performance Comparison
143
-
144
- | Method | Total Time | API Calls | Accuracy | Best For |
145
- |--------|------------|-----------|----------|----------|
146
- | Sequential | ~45 min | 44 calls | Highest | Small datasets |
147
- | **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
148
- | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
149
135
 
150
136
  ## Advanced Usage
151
137
 
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
169
155
  --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
170
156
  --max-workers 5 --output substrate_scope.csv
171
157
  ```
172
-
173
- ## Python API
174
-
175
- ```python
176
- from debase.wrapper import run_pipeline
177
-
178
- # Run full pipeline with parallel processing
179
- run_pipeline(
180
- manuscript_path="paper.pdf",
181
- si_path="si.pdf",
182
- output="output.csv",
183
- use_parallel_individual=True,
184
- max_workers=5
185
- )
186
-
187
- # For individual steps
188
- from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
189
- from debase.enzyme_lineage_extractor import setup_gemini_api
190
-
191
- model = setup_gemini_api()
192
- reaction_data = extract_reaction_info_parallel(
193
- model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
194
- )
195
- ```
196
-
197
158
  ## Pipeline Architecture
198
159
 
199
160
  The DEBase pipeline consists of 5 main steps:
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
222
183
  - **External database integration:** Automatic sequence fetching from PDB and UniProt
223
184
  - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
224
185
  - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
225
- - **Progress tracking:** Real-time status updates
226
- - **Flexible output:** CSV format with comprehensive chemical and performance data
227
- - **Caching:** PDF encoding cache for improved performance
228
186
  - **Vision capabilities:** Extracts data from both text and images in PDFs
229
187
 
230
188
  ## Complete Command Reference
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
234
192
  --manuscript PATH # Required: Path to manuscript PDF
235
193
  --si PATH # Optional: Path to supplementary information PDF
236
194
  --output PATH # Output file path (default: manuscript_name_debase.csv)
237
- --queries N # Number of consensus queries (default: 2)
238
195
  ```
239
196
 
240
197
  ### Performance Options
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
279
236
  3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
280
237
  4. **Skip validation** (`--skip-validation`) for faster processing in production
281
238
  5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
282
- 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
283
- 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
284
-
285
- ## Troubleshooting
286
-
287
- ### No sequences found
288
- - The extractor will automatically search PDB and UniProt databases
289
- - Check the logs for which database IDs were found and attempted
290
- - Sequences with PDB structures will be fetched with high confidence
291
-
292
- ### Incorrect enzyme extraction
293
- - Non-enzyme entries (buffers, controls, media) are automatically filtered
294
- - Check the log for entries marked as "Filtering out non-enzyme entry"
239
+ 6.
295
240
 
296
- ### PDB matching issues
297
- - The system uses AI to match PDB IDs to specific enzyme variants
298
- - Increased context extraction ensures better matching accuracy
299
- - Check logs for "Gemini PDB matching" entries to see the matching process
@@ -1,8 +1,15 @@
1
+ .gitignore
2
+ CONTRIBUTING.md
1
3
  LICENSE
2
4
  MANIFEST.in
3
5
  README.md
6
+ environment.yml
4
7
  pyproject.toml
5
8
  setup.py
9
+ docs/README.md
10
+ docs/examples/README.md
11
+ src/__init__.py
12
+ src/debase/PIPELINE_FLOW.md
6
13
  src/debase/__init__.py
7
14
  src/debase/__main__.py
8
15
  src/debase/_version.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes