debase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
debase/wrapper.py
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Enzyme Analysis Pipeline Wrapper (Clean Version)
|
4
|
+
|
5
|
+
Pipeline flow:
|
6
|
+
1. enzyme_lineage_extractor.py - Extract enzyme data from PDFs
|
7
|
+
2. cleanup_sequence.py - Clean and validate protein sequences
|
8
|
+
3. reaction_info_extractor.py - Extract reaction performance metrics
|
9
|
+
4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
|
10
|
+
5. lineage_format.py - Format and merge all data into final CSV
|
11
|
+
|
12
|
+
The reaction_info and substrate_scope extractors are independent of each other and run one after the other,
|
13
|
+
then their outputs are combined in lineage_format.py.
|
14
|
+
"""
|
15
|
+
import os
|
16
|
+
import sys
|
17
|
+
import argparse
|
18
|
+
import logging
|
19
|
+
import time
|
20
|
+
from datetime import datetime
|
21
|
+
from pathlib import Path
|
22
|
+
|
23
|
+
# Setup logging
|
24
|
+
logging.basicConfig(
|
25
|
+
level=logging.INFO,
|
26
|
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
27
|
+
)
|
28
|
+
logger = logging.getLogger("EnzymePipeline")
|
29
|
+
|
30
|
+
|
31
|
+
def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
    """
    Step 1: Extract enzyme lineage data from PDFs.

    Calls: enzyme_lineage_extractor.py (``run_pipeline``)

    Args:
        manuscript: Path to the manuscript PDF.
        si: Path to the supplementary-information PDF; forwarded as-is
            (the pipeline entry point passes None when no SI was given).
        output: Destination CSV path, forwarded as ``output_csv``.
        debug_dir: Optional directory forwarded to the extractor for debug output.

    Returns:
        ``output``, unchanged, so the caller can chain steps.
    """
    logger.info(f"Extracting enzyme lineage from {manuscript.name}")

    # Kept for source-checkout runs: puts the directory three levels above this
    # file on sys.path so that ``src.debase`` resolves. The other step functions
    # rely on this having happened, so it stays unconditional; only avoid
    # inserting the same entry on every call.
    checkout_root = str(Path(__file__).parent.parent.parent)
    if checkout_root not in sys.path:
        sys.path.insert(0, checkout_root)

    try:
        # Installed layout: the distribution's top-level package is ``debase``.
        from debase.enzyme_lineage_extractor import run_pipeline
    except ModuleNotFoundError as exc:
        # Only fall back when ``debase`` itself is missing; a missing
        # third-party dependency inside the module must surface unchanged.
        if exc.name not in ("debase", "debase.enzyme_lineage_extractor"):
            raise
        from src.debase.enzyme_lineage_extractor import run_pipeline

    run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)

    logger.info(f"Lineage extraction complete: {output}")
    return output
|
45
|
+
|
46
|
+
|
47
|
+
def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
    """
    Step 2: Clean and validate protein sequences.

    Calls: cleanup_sequence.py (its ``main``, with an argv-style list)

    Args:
        input_csv: CSV produced by the lineage-extraction step.
        output_csv: Destination path for the cleaned CSV.

    Returns:
        ``output_csv``, unchanged.
    """
    logger.info(f"Cleaning sequences from {input_csv.name}")

    try:
        # Installed layout: the distribution's top-level package is ``debase``.
        from debase.cleanup_sequence import main as cleanup_sequences
    except ModuleNotFoundError as exc:
        # Fall back to the source-checkout layout only when ``debase`` itself
        # is what could not be found.
        if exc.name not in ("debase", "debase.cleanup_sequence"):
            raise
        from src.debase.cleanup_sequence import main as cleanup_sequences

    # The cleanup entry point takes positional CLI-style arguments as strings.
    cleanup_sequences([str(input_csv), str(output_csv)])

    logger.info(f"Sequence cleanup complete: {output_csv}")
    return output_csv
|
59
|
+
|
60
|
+
|
61
|
+
def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
    """
    Step 3a: Extract reaction performance metrics.

    Calls: reaction_info_extractor.py (``ReactionExtractor`` and ``Config``)

    Args:
        manuscript: Path to the manuscript PDF.
        si: Path to the supplementary-information PDF (forwarded as-is).
        lineage_csv: CSV listing the enzymes to process; loaded with pandas
            and handed to ``ReactionExtractor.run``.
        output: Destination CSV path for the extracted metrics.
        debug_dir: Optional directory forwarded to the extractor for debug output.

    Returns:
        ``output``, unchanged.
    """
    logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")

    try:
        # Installed layout: the distribution's top-level package is ``debase``.
        from debase.reaction_info_extractor import ReactionExtractor, Config
    except ModuleNotFoundError as exc:
        # Fall back to the source-checkout layout only when ``debase`` itself
        # is what could not be found.
        if exc.name not in ("debase", "debase.reaction_info_extractor"):
            raise
        from src.debase.reaction_info_extractor import ReactionExtractor, Config
    import pandas as pd

    # Load enzyme data
    enzyme_df = pd.read_csv(lineage_csv)

    # Initialize extractor with default configuration and run it
    cfg = Config()
    extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
    df_metrics = extractor.run(enzyme_df)

    # Save results (the result is written via ``to_csv``, so it is presumably
    # a DataFrame — confirm against ReactionExtractor.run)
    df_metrics.to_csv(output, index=False)
    logger.info(f"Reaction extraction complete: {output}")
    return output
|
83
|
+
|
84
|
+
|
85
|
+
def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
    """
    Step 3b: Extract substrate scope data.

    This step does not consume the output of the reaction-extraction step;
    both take the same cleaned lineage CSV as input.

    Calls: substrate_scope_extractor.py (``run_pipeline``)

    Args:
        manuscript: Path to the manuscript PDF.
        si: Path to the supplementary-information PDF (forwarded as-is).
        lineage_csv: CSV listing the enzymes to process.
        output: Destination CSV path, forwarded as ``output_csv``.
        debug_dir: Optional directory forwarded to the extractor for debug output.

    Returns:
        ``output``, unchanged.
    """
    logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")

    try:
        # Installed layout: the distribution's top-level package is ``debase``.
        from debase.substrate_scope_extractor import run_pipeline
    except ModuleNotFoundError as exc:
        # Fall back to the source-checkout layout only when ``debase`` itself
        # is what could not be found.
        if exc.name not in ("debase", "debase.substrate_scope_extractor"):
            raise
        from src.debase.substrate_scope_extractor import run_pipeline

    # Run substrate scope extraction
    run_pipeline(
        manuscript=manuscript,
        si=si,
        lineage_csv=lineage_csv,
        output_csv=output,
        debug_dir=debug_dir,
    )

    logger.info(f"Substrate scope extraction complete: {output}")
    return output
|
105
|
+
|
106
|
+
|
107
|
+
def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
    """
    Step 4: Format and merge all data into final CSV.

    Calls: lineage_format.py (``run_pipeline``)

    Sequence columns from ``cleaned_csv`` are first left-joined onto the
    reaction data, and the merged table is written back over ``reaction_csv``
    (the file is modified in place) before the formatter is invoked.

    Args:
        reaction_csv: Reaction-metrics CSV from step 3a; overwritten with the
            sequence-enriched version.
        substrate_scope_csv: Substrate-scope CSV from step 3b.
        cleaned_csv: Cleaned sequence CSV from step 2; must contain an
            ``enzyme_id`` column.
        output_csv: Destination path for the final CSV.

    Returns:
        ``output_csv``, unchanged.
    """
    logger.info("Formatting and merging data into final output")

    try:
        # Installed layout: the distribution's top-level package is ``debase``.
        from debase.lineage_format import run_pipeline
    except ModuleNotFoundError as exc:
        # Fall back to the source-checkout layout only when ``debase`` itself
        # is what could not be found.
        if exc.name not in ("debase", "debase.lineage_format"):
            raise
        from src.debase.lineage_format import run_pipeline
    import pandas as pd

    # First, we need to merge the protein sequences into the reaction data
    df_reaction = pd.read_csv(reaction_csv)
    df_sequences = pd.read_csv(cleaned_csv)

    # Include generation and parent info for proper mutation calculation.
    # Only the columns actually present in the cleaned CSV are carried over.
    sequence_cols = ['protein_sequence', 'dna_seq', 'seq_confidence', 'truncated', 'flag',
                     'generation', 'parent_enzyme_id', 'mutations']
    present_cols = [col for col in sequence_cols if col in df_sequences.columns]
    sequence_data = df_sequences[['enzyme_id'] + present_cols]

    # Merge on whichever identifier column the reaction data uses. Columns
    # that already exist in the reaction data keep their name; the incoming
    # duplicate gets the '_seq' suffix.
    if 'enzyme_id' in df_reaction.columns:
        df_reaction = df_reaction.merge(sequence_data, on='enzyme_id', how='left', suffixes=('', '_seq'))
    elif 'enzyme' in df_reaction.columns:
        sequence_data = sequence_data.rename(columns={'enzyme_id': 'enzyme'})
        df_reaction = df_reaction.merge(sequence_data, on='enzyme', how='left', suffixes=('', '_seq'))
    else:
        # Previously this case was skipped silently, which hid missing
        # sequence data in the final output.
        logger.warning(
            "Reaction data has neither an 'enzyme_id' nor an 'enzyme' column; "
            "sequence data was not merged"
        )

    # Save the merged reaction data
    df_reaction.to_csv(reaction_csv, index=False)

    # Run the formatting pipeline; its return value is not needed here.
    run_pipeline(
        reaction_csv=reaction_csv,
        substrate_scope_csv=substrate_scope_csv,
        output_csv=output_csv,
    )

    logger.info(f"Final formatting complete: {output_csv}")
    return output_csv
|
146
|
+
|
147
|
+
|
148
|
+
def run_pipeline(
    manuscript_path: Path,
    si_path: Path = None,
    output_path: Path = None,
    keep_intermediates: bool = False,
    debug_dir: Path = None
) -> Path:
    """Run the complete enzyme analysis pipeline.

    Steps run strictly one after the other: lineage extraction, sequence
    cleanup, reaction extraction, substrate-scope extraction, then final
    formatting. All intermediate CSVs are written next to the final output.

    Args:
        manuscript_path: Path (or string) to the manuscript PDF.
        si_path: Optional path to the supplementary-information PDF.
        output_path: Final CSV path. Defaults to ``<manuscript stem>_debase.csv``
            (spaces replaced by underscores) in the current directory.
        keep_intermediates: When False, the intermediate CSVs are removed after
            a successful run; when True they are left in the output directory.
            On failure they are always left in place for inspection.
        debug_dir: Optional directory forwarded to the extraction steps.

    Returns:
        The path of the final CSV.

    Raises:
        Exception: Any error raised by a step is logged and re-raised.
    """
    # Setup paths
    manuscript_path = Path(manuscript_path)
    si_path = Path(si_path) if si_path else None

    # Create output filename based on manuscript
    if not output_path:
        output_name = manuscript_path.stem.replace(' ', '_')
        output_path = Path(f"{output_name}_debase.csv")
    else:
        output_path = Path(output_path)

    # Use the output directory for all files
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Define intermediate file paths (all in the same directory as output)
    lineage_csv = output_dir / "enzyme_lineage_data.csv"  # This is what enzyme_lineage_extractor actually outputs
    cleaned_csv = output_dir / "2_enzyme_sequences.csv"
    reaction_csv = output_dir / "3a_reaction_info.csv"
    substrate_csv = output_dir / "3b_substrate_scope.csv"

    try:
        logger.info("="*60)
        logger.info("Starting DEBase Enzyme Analysis Pipeline")
        logger.info(f"Manuscript: {manuscript_path}")
        logger.info(f"SI: {si_path if si_path else 'None'}")
        logger.info(f"Output: {output_path}")
        logger.info("="*60)

        start_time = time.time()

        # Step 1: Extract enzyme lineage
        logger.info("\n[Step 1/5] Extracting enzyme lineage...")
        run_lineage_extraction(manuscript_path, si_path, lineage_csv, debug_dir=debug_dir)

        # Step 2: Clean sequences
        logger.info("\n[Step 2/5] Cleaning sequences...")
        run_sequence_cleanup(lineage_csv, cleaned_csv)

        # Step 3: Extract reaction info, then substrate scope (sequentially)
        logger.info("\n[Step 3/5] Extracting reaction info and substrate scope...")

        # Run reaction extraction
        logger.info("  - Extracting reaction metrics...")
        run_reaction_extraction(manuscript_path, si_path, cleaned_csv, reaction_csv, debug_dir=debug_dir)

        # Add small delay to avoid API rate limits
        time.sleep(2)

        # Run substrate scope extraction
        logger.info("  - Extracting substrate scope...")
        run_substrate_scope_extraction(manuscript_path, si_path, cleaned_csv, substrate_csv, debug_dir=debug_dir)

        # Step 4: Format and merge
        logger.info("\n[Step 4/5] Formatting and merging data...")
        run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)

        # Step 5: Finalize
        logger.info("\n[Step 5/5] Finalizing...")
        elapsed = time.time() - start_time

        if keep_intermediates:
            logger.info(f"All intermediate files saved in: {output_dir}")
        else:
            # Honour the flag: previously the files were left behind even
            # though the message below says they are not saved.
            for intermediate in (lineage_csv, cleaned_csv, reaction_csv, substrate_csv):
                if intermediate == output_path:
                    # The user chose an output name that collides with an
                    # intermediate; never delete the final result.
                    continue
                try:
                    intermediate.unlink()
                except FileNotFoundError:
                    pass
                except OSError as cleanup_error:
                    # Cleanup is best-effort; a leftover file must not fail
                    # an otherwise successful run.
                    logger.warning(f"Could not remove intermediate file {intermediate}: {cleanup_error}")
            logger.info("Note: Use --keep-intermediates to save intermediate files")

        logger.info("\n" + "="*60)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info(f"Output: {output_path}")
        logger.info(f"Runtime: {elapsed:.1f} seconds")
        logger.info("="*60)

        return output_path

    except Exception as e:
        logger.error(f"Pipeline failed: {str(e)}")
        raise
|
233
|
+
|
234
|
+
|
235
|
+
def main():
    """Command-line entry point for the DEBase pipeline.

    Parses the CLI arguments, checks that the given PDF paths exist, and runs
    ``run_pipeline``. Argument errors exit through ``argparse`` (status 2);
    any exception raised by the pipeline is logged and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser(
        description='DEBase Enzyme Analysis Pipeline - Extract enzyme data from chemistry papers',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Pipeline steps:
1. enzyme_lineage_extractor - Extract enzyme variants from PDFs
2. cleanup_sequence - Validate and clean protein sequences
3. reaction_info_extractor - Extract reaction performance metrics
4. substrate_scope_extractor - Extract substrate scope data
5. lineage_format - Format and merge into final CSV

The pipeline automatically handles all steps sequentially.
"""
    )

    # Required arguments
    parser.add_argument(
        '--manuscript',
        type=Path,
        # Without required=True a missing flag left args.manuscript as None
        # and the existence check below crashed with AttributeError.
        required=True,
        help='Path to manuscript PDF'
    )

    # Optional arguments
    parser.add_argument(
        '--si',
        type=Path,
        help='Path to supplementary information PDF'
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output CSV path (default: manuscript_name_debase.csv)'
    )
    parser.add_argument(
        '--keep-intermediates',
        action='store_true',
        help='Keep intermediate files for debugging'
    )
    parser.add_argument(
        '--debug-dir',
        type=Path,
        help='Directory for debug output (prompts, API responses)'
    )

    args = parser.parse_args()

    # Check inputs
    if not args.manuscript.exists():
        parser.error(f"Manuscript not found: {args.manuscript}")
    if args.si and not args.si.exists():
        parser.error(f"SI not found: {args.si}")

    # Run pipeline
    try:
        run_pipeline(
            manuscript_path=args.manuscript,
            si_path=args.si,
            output_path=args.output,
            keep_intermediates=args.keep_intermediates,
            debug_dir=args.debug_dir
        )
    except Exception as e:
        logger.error(f"Pipeline error: {e}")
        sys.exit(1)
|
300
|
+
|
301
|
+
|
302
|
+
if __name__ == "__main__":
|
303
|
+
main()
|
@@ -0,0 +1,299 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: debase
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Enzyme lineage analysis and sequence extraction package
|
5
|
+
Home-page: https://github.com/YuemingLong/DEBase
|
6
|
+
Author: DEBase Team
|
7
|
+
Author-email: DEBase Team <ylong@caltech.edu>
|
8
|
+
License: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/YuemingLong/DEBase
|
10
|
+
Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
|
11
|
+
Project-URL: Repository, https://github.com/YuemingLong/DEBase
|
12
|
+
Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
25
|
+
Requires-Python: >=3.8
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
License-File: LICENSE
|
28
|
+
Requires-Dist: pandas>=1.0.0
|
29
|
+
Requires-Dist: PyMuPDF>=1.18.0
|
30
|
+
Requires-Dist: numpy>=1.19.0
|
31
|
+
Requires-Dist: google-generativeai>=0.3.0
|
32
|
+
Requires-Dist: biopython>=1.78
|
33
|
+
Requires-Dist: requests>=2.25.0
|
34
|
+
Requires-Dist: httpx>=0.24.0
|
35
|
+
Requires-Dist: tqdm>=4.60.0
|
36
|
+
Requires-Dist: openpyxl>=3.0.0
|
37
|
+
Requires-Dist: PyPDF2>=2.0.0
|
38
|
+
Requires-Dist: Pillow>=8.0.0
|
39
|
+
Requires-Dist: networkx>=2.5
|
40
|
+
Provides-Extra: rdkit
|
41
|
+
Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
|
42
|
+
Provides-Extra: dev
|
43
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
44
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
45
|
+
Requires-Dist: black; extra == "dev"
|
46
|
+
Requires-Dist: isort; extra == "dev"
|
47
|
+
Requires-Dist: flake8; extra == "dev"
|
48
|
+
Requires-Dist: mypy; extra == "dev"
|
49
|
+
Provides-Extra: docs
|
50
|
+
Requires-Dist: sphinx>=4.0; extra == "docs"
|
51
|
+
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
52
|
+
Requires-Dist: myst-parser; extra == "docs"
|
53
|
+
Dynamic: author
|
54
|
+
Dynamic: home-page
|
55
|
+
Dynamic: license-file
|
56
|
+
Dynamic: requires-python
|
57
|
+
|
58
|
+
# DEBase
|
59
|
+
|
60
|
+
Enzyme lineage analysis and sequence extraction package with advanced parallel processing capabilities.
|
61
|
+
|
62
|
+
## Installation
|
63
|
+
|
64
|
+
```bash
|
65
|
+
pip install debase
|
66
|
+
```
|
67
|
+
|
68
|
+
For full functionality with chemical SMILES support:
|
69
|
+
|
70
|
+
```bash
|
71
|
+
pip install debase[rdkit]
|
72
|
+
```
|
73
|
+
|
74
|
+
## Requirements
|
75
|
+
|
76
|
+
- Python 3.8 or higher
|
77
|
+
- A Gemini API key (set as environment variable `GEMINI_API_KEY`)
|
78
|
+
|
79
|
+
## Recent Updates
|
80
|
+
|
81
|
+
- **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
|
82
|
+
- **Improved Model Support**: Updated to use stable Gemini models for better reliability
|
83
|
+
- **Enhanced PDB Integration**: Intelligent AI-based matching of PDB structures to enzyme variants
|
84
|
+
- **Better Filtering**: Automatic removal of non-enzyme entries (buffers, controls, media)
|
85
|
+
- **Optimized Performance**: Removed unnecessary rate limiting for faster processing
|
86
|
+
- **External Sequence Fetching**: Automatic retrieval from PDB and UniProt databases when sequences aren't in papers
|
87
|
+
- **Improved SI Processing**: Structure-aware extraction of supplementary information
|
88
|
+
- **Vision Support**: Extracts data from figures and tables using multimodal AI capabilities
|
89
|
+
|
90
|
+
## Quick Start
|
91
|
+
|
92
|
+
### Basic Usage
|
93
|
+
```bash
|
94
|
+
# Run the full pipeline (sequential processing)
|
95
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv
|
96
|
+
```
|
97
|
+
|
98
|
+
### High-Performance Parallel Processing
|
99
|
+
```bash
|
100
|
+
# Use parallel individual processing for maximum speed + accuracy
|
101
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
102
|
+
--use-parallel-individual --max-workers 5
|
103
|
+
|
104
|
+
# Use batch processing for maximum speed (slight accuracy trade-off)
|
105
|
+
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
106
|
+
--use-optimized-reaction --reaction-batch-size 5
|
107
|
+
```
|
108
|
+
|
109
|
+
## Processing Methods
|
110
|
+
|
111
|
+
DEBase offers three processing approaches optimized for different use cases:
|
112
|
+
|
113
|
+
### 1. **Parallel Individual Processing** (Recommended)
|
114
|
+
- **42 individual API calls** (21 for reactions + 21 for substrate scope)
|
115
|
+
- **5 calls running simultaneously** for 4-5x speedup
|
116
|
+
- **Maximum accuracy** - each enzyme gets dedicated attention
|
117
|
+
- **Best for:** Production use, important analyses
|
118
|
+
|
119
|
+
```bash
|
120
|
+
debase --manuscript paper.pdf --si si.pdf --use-parallel-individual --max-workers 5
|
121
|
+
```
|
122
|
+
|
123
|
+
### 2. **Batch Processing** (Fastest)
|
124
|
+
- **~8 total API calls** (multiple enzymes per call)
|
125
|
+
- **Fastest processing** - up to 8x speedup
|
126
|
+
- **Good accuracy** - slight trade-off for complex chemical names
|
127
|
+
- **Best for:** Quick analyses, large-scale processing
|
128
|
+
|
129
|
+
```bash
|
130
|
+
debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-batch-size 5
|
131
|
+
```
|
132
|
+
|
133
|
+
### 3. **Sequential Processing** (Most Accurate)
|
134
|
+
- **42 sequential API calls** (one at a time)
|
135
|
+
- **Highest accuracy** but slowest
|
136
|
+
- **Best for:** Critical analyses, small datasets
|
137
|
+
|
138
|
+
```bash
|
139
|
+
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
|
+
```
|
141
|
+
|
142
|
+
## Performance Comparison
|
143
|
+
|
144
|
+
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
+
|--------|------------|-----------|----------|----------|
|
146
|
+
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
147
|
+
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
148
|
+
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
|
+
|
150
|
+
## Advanced Usage
|
151
|
+
|
152
|
+
### Skip Steps with Existing Data
|
153
|
+
```bash
|
154
|
+
# Skip lineage extraction if you already have it
|
155
|
+
debase --manuscript paper.pdf --si si.pdf --output output.csv \
|
156
|
+
--skip-lineage --existing-lineage existing_lineage.csv \
|
157
|
+
--use-parallel-individual
|
158
|
+
```
|
159
|
+
|
160
|
+
### Direct Module Usage
|
161
|
+
```bash
|
162
|
+
# Run only reaction extraction with parallel processing
|
163
|
+
python -m debase.reaction_info_extractor_parallel \
|
164
|
+
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
165
|
+
--max-workers 5 --output reactions.csv
|
166
|
+
|
167
|
+
# Run only substrate scope extraction with parallel processing
|
168
|
+
python -m debase.substrate_scope_extractor_parallel \
|
169
|
+
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
|
+
--max-workers 5 --output substrate_scope.csv
|
171
|
+
```
|
172
|
+
|
173
|
+
## Python API
|
174
|
+
|
175
|
+
```python
|
176
|
+
from debase.wrapper import run_pipeline
|
177
|
+
|
178
|
+
# Run full pipeline with parallel processing
|
179
|
+
run_pipeline(
|
180
|
+
manuscript_path="paper.pdf",
|
181
|
+
si_path="si.pdf",
|
182
|
+
output="output.csv",
|
183
|
+
use_parallel_individual=True,
|
184
|
+
max_workers=5
|
185
|
+
)
|
186
|
+
|
187
|
+
# For individual steps
|
188
|
+
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
+
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
+
|
191
|
+
model = setup_gemini_api()
|
192
|
+
reaction_data = extract_reaction_info_parallel(
|
193
|
+
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
+
)
|
195
|
+
```
|
196
|
+
|
197
|
+
## Pipeline Architecture
|
198
|
+
|
199
|
+
The DEBase pipeline consists of 5 main steps:
|
200
|
+
|
201
|
+
1. **Lineage Extraction** (Sequential) - Identifies all enzymes and their relationships
|
202
|
+
- Extracts mutation information and evolutionary paths
|
203
|
+
- Detects multiple directed evolution campaigns automatically
|
204
|
+
- Fetches sequences from external databases (PDB, UniProt)
|
205
|
+
- Filters out non-enzyme entries automatically
|
206
|
+
2. **Sequence Cleanup** (Local) - Generates protein sequences from mutations
|
207
|
+
- Applies mutations to parent sequences
|
208
|
+
- Handles complex mutations and domain modifications
|
209
|
+
- Validates sequence integrity
|
210
|
+
3. **Reaction Extraction** (Parallel/Batch/Sequential) - Extracts reaction conditions and performance data
|
211
|
+
- Campaign-aware extraction for multi-lineage papers
|
212
|
+
- Vision-based extraction from figures and tables
|
213
|
+
- Automatic IUPAC name resolution
|
214
|
+
4. **Substrate Scope Extraction** (Parallel/Sequential) - Finds additional substrates tested
|
215
|
+
5. **Data Formatting** (Local) - Combines all data into final output
|
216
|
+
|
217
|
+
## Features
|
218
|
+
|
219
|
+
- **Multi-processing modes:** Sequential, parallel individual, and batch processing
|
220
|
+
- **Campaign detection:** Automatically identifies and separates multiple directed evolution campaigns
|
221
|
+
- **Intelligent error handling:** Automatic retries with exponential backoff
|
222
|
+
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
|
+
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
|
+
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
+
- **Progress tracking:** Real-time status updates
|
226
|
+
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
+
- **Caching:** PDF encoding cache for improved performance
|
228
|
+
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
|
+
|
230
|
+
## Complete Command Reference
|
231
|
+
|
232
|
+
### Core Arguments
|
233
|
+
```bash
|
234
|
+
--manuscript PATH # Required: Path to manuscript PDF
|
235
|
+
--si PATH # Optional: Path to supplementary information PDF
|
236
|
+
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
+
--queries N # Number of consensus queries (default: 2)
|
238
|
+
```
|
239
|
+
|
240
|
+
### Performance Options
|
241
|
+
```bash
|
242
|
+
--use-parallel-individual # Use parallel processing (recommended)
|
243
|
+
--max-workers N # Number of parallel workers (default: 5)
|
244
|
+
--use-optimized-reaction # Use batch processing for speed
|
245
|
+
--reaction-batch-size N # Enzymes per batch (default: 5)
|
246
|
+
--no-parallel-queries # Disable parallel processing
|
247
|
+
```
|
248
|
+
|
249
|
+
### Pipeline Control
|
250
|
+
```bash
|
251
|
+
--skip-lineage # Skip lineage extraction step
|
252
|
+
--skip-sequence # Skip sequence cleanup step
|
253
|
+
--skip-reaction # Skip reaction extraction step
|
254
|
+
--skip-substrate-scope # Skip substrate scope extraction step
|
255
|
+
--skip-lineage-format # Skip final formatting step
|
256
|
+
--skip-validation # Skip data validation step
|
257
|
+
```
|
258
|
+
|
259
|
+
### Data Management
|
260
|
+
```bash
|
261
|
+
--existing-lineage PATH # Use existing lineage data
|
262
|
+
--existing-sequence PATH # Use existing sequence data
|
263
|
+
--existing-reaction PATH # Use existing reaction data
|
264
|
+
--keep-intermediates # Preserve intermediate files
|
265
|
+
```
|
266
|
+
|
267
|
+
### Advanced Options
|
268
|
+
```bash
|
269
|
+
--model-name NAME # Gemini model to use
|
270
|
+
--max-retries N # Maximum retry attempts (default: 2)
|
271
|
+
--max-chars N # Max characters from PDFs (default: 75000)
|
272
|
+
--debug-dir PATH # Directory for debug output (prompts, API responses)
|
273
|
+
```
|
274
|
+
|
275
|
+
## Tips for Best Performance
|
276
|
+
|
277
|
+
1. **Use parallel individual processing** for the best balance of speed and accuracy
|
278
|
+
2. **Set max-workers to 5** to avoid API rate limits while maximizing throughput
|
279
|
+
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
|
+
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
|
+
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
+
6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
|
283
|
+
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
+
|
285
|
+
## Troubleshooting
|
286
|
+
|
287
|
+
### No sequences found
|
288
|
+
- The extractor will automatically search PDB and UniProt databases
|
289
|
+
- Check the logs for which database IDs were found and attempted
|
290
|
+
- Sequences with PDB structures will be fetched with high confidence
|
291
|
+
|
292
|
+
### Incorrect enzyme extraction
|
293
|
+
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
+
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
295
|
+
|
296
|
+
### PDB matching issues
|
297
|
+
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
+
- Increased context extraction ensures better matching accuracy
|
299
|
+
- Check logs for "Gemini PDB matching" entries to see the matching process
|
@@ -0,0 +1,17 @@
|
|
1
|
+
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
+
debase/_version.py,sha256=HnfC_TWAA2mfjIbkXT0ipZEqElS5wLaMzSj1DkE1F88,49
|
5
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
|
+
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
|
+
debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
|
8
|
+
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
|
+
debase/reaction_info_extractor.py,sha256=euw-4NHFuOPxpF99PJxTMLYYG0WryBDUCpoANB-SPPM,109655
|
10
|
+
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
|
+
debase/wrapper.py,sha256=UlUBxxIXBnVtSIT9lZXkQeImlCABiUuof1CVZNKv9N4,10482
|
12
|
+
debase-0.1.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.0.dist-info/METADATA,sha256=3s1NGPGYOb2bbP5PD5OoWBcJ7UeZ2OTQiOQ-SE5uqoM,11509
|
14
|
+
debase-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.0.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 DEBase Contributors
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
debase
|