debase-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/wrapper.py ADDED
@@ -0,0 +1,303 @@
+ #!/usr/bin/env python3
+ """
+ Enzyme Analysis Pipeline Wrapper (Clean Version)
+
+ Pipeline flow:
+ 1. enzyme_lineage_extractor.py - Extract enzyme data from PDFs
+ 2. cleanup_sequence.py - Clean and validate protein sequences
+ 3. reaction_info_extractor.py - Extract reaction performance metrics
+ 4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
+ 5. lineage_format.py - Format and merge all data into final CSV
+
+ The reaction_info and substrate_scope extractors run independently of each other;
+ their outputs are then combined in lineage_format.
+ """
+ import os
+ import sys
+ import argparse
+ import logging
+ import time
+ from datetime import datetime
+ from pathlib import Path
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger("EnzymePipeline")
+
+
+ def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
+     """
+     Step 1: Extract enzyme lineage data from PDFs
+     Calls: enzyme_lineage_extractor.py
+     """
+     logger.info(f"Extracting enzyme lineage from {manuscript.name}")
+
+     import sys
+     sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+     from src.debase.enzyme_lineage_extractor import run_pipeline
+     run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)
+
+     logger.info(f"Lineage extraction complete: {output}")
+     return output
+
+
+ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
+     """
+     Step 2: Clean and validate protein sequences
+     Calls: cleanup_sequence.py
+     """
+     logger.info(f"Cleaning sequences from {input_csv.name}")
+
+     from src.debase.cleanup_sequence import main as cleanup_sequences
+     cleanup_sequences([str(input_csv), str(output_csv)])
+
+     logger.info(f"Sequence cleanup complete: {output_csv}")
+     return output_csv
+
+
+ def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
+     """
+     Step 3a: Extract reaction performance metrics
+     Calls: reaction_info_extractor.py
+     """
+     logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
+
+     from src.debase.reaction_info_extractor import ReactionExtractor, Config
+     import pandas as pd
+
+     # Load enzyme data
+     enzyme_df = pd.read_csv(lineage_csv)
+
+     # Initialize extractor and run
+     cfg = Config()
+     extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
+     df_metrics = extractor.run(enzyme_df)
+
+     # Save results
+     df_metrics.to_csv(output, index=False)
+     logger.info(f"Reaction extraction complete: {output}")
+     return output
+
+
+ def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
+     """
+     Step 3b: Extract substrate scope data (runs independently of reaction extraction)
+     Calls: substrate_scope_extractor.py
+     """
+     logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
+
+     from src.debase.substrate_scope_extractor import run_pipeline
+
+     # Run substrate scope extraction
+     run_pipeline(
+         manuscript=manuscript,
+         si=si,
+         lineage_csv=lineage_csv,
+         output_csv=output,
+         debug_dir=debug_dir
+     )
+
+     logger.info(f"Substrate scope extraction complete: {output}")
+     return output
+
+
+ def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
+     """
+     Step 4: Format and merge all data into final CSV
+     Calls: lineage_format.py
+     """
+     logger.info(f"Formatting and merging data into final output")
+
+     from src.debase.lineage_format import run_pipeline
+     import pandas as pd
+
+     # First, we need to merge the protein sequences into the reaction data
+     df_reaction = pd.read_csv(reaction_csv)
+     df_sequences = pd.read_csv(cleaned_csv)
+
+     # Merge sequences into reaction data
+     # Include generation and parent info for proper mutation calculation
+     sequence_cols = ['protein_sequence', 'dna_seq', 'seq_confidence', 'truncated', 'flag',
+                      'generation', 'parent_enzyme_id', 'mutations']
+     sequence_data = df_sequences[['enzyme_id'] + [col for col in sequence_cols if col in df_sequences.columns]]
+
+     # Merge on enzyme_id or variant_id
+     if 'enzyme_id' in df_reaction.columns:
+         df_reaction = df_reaction.merge(sequence_data, on='enzyme_id', how='left', suffixes=('', '_seq'))
+     elif 'enzyme' in df_reaction.columns:
+         sequence_data = sequence_data.rename(columns={'enzyme_id': 'enzyme'})
+         df_reaction = df_reaction.merge(sequence_data, on='enzyme', how='left', suffixes=('', '_seq'))
+
+     # Save the merged reaction data
+     df_reaction.to_csv(reaction_csv, index=False)
+
+     # Run the formatting pipeline
+     df_final = run_pipeline(
+         reaction_csv=reaction_csv,
+         substrate_scope_csv=substrate_scope_csv,
+         output_csv=output_csv
+     )
+
+     logger.info(f"Final formatting complete: {output_csv}")
+     return output_csv
+
+
+ def run_pipeline(
+     manuscript_path: Path,
+     si_path: Path = None,
+     output_path: Path = None,
+     keep_intermediates: bool = False,
+     debug_dir: Path = None
+ ) -> Path:
+     """Run the complete enzyme analysis pipeline."""
+     # Setup paths
+     manuscript_path = Path(manuscript_path)
+     si_path = Path(si_path) if si_path else None
+
+     # Create output filename based on manuscript
+     if not output_path:
+         output_name = manuscript_path.stem.replace(' ', '_')
+         output_path = Path(f"{output_name}_debase.csv")
+     else:
+         output_path = Path(output_path)
+
+     # Use the output directory for all files
+     output_dir = output_path.parent
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Define intermediate file paths (all in the same directory as output)
+     lineage_csv = output_dir / "enzyme_lineage_data.csv"  # This is what enzyme_lineage_extractor actually outputs
+     cleaned_csv = output_dir / "2_enzyme_sequences.csv"
+     reaction_csv = output_dir / "3a_reaction_info.csv"
+     substrate_csv = output_dir / "3b_substrate_scope.csv"
+
+     try:
+         logger.info("="*60)
+         logger.info("Starting DEBase Enzyme Analysis Pipeline")
+         logger.info(f"Manuscript: {manuscript_path}")
+         logger.info(f"SI: {si_path if si_path else 'None'}")
+         logger.info(f"Output: {output_path}")
+         logger.info("="*60)
+
+         start_time = time.time()
+
+         # Step 1: Extract enzyme lineage
+         logger.info("\n[Step 1/5] Extracting enzyme lineage...")
+         run_lineage_extraction(manuscript_path, si_path, lineage_csv, debug_dir=debug_dir)
+
+         # Step 2: Clean sequences
+         logger.info("\n[Step 2/5] Cleaning sequences...")
+         run_sequence_cleanup(lineage_csv, cleaned_csv)
+
+         # Step 3: Extract reaction info and substrate scope
+         logger.info("\n[Step 3/5] Extracting reaction info and substrate scope...")
+
+         # Run reaction extraction
+         logger.info(" - Extracting reaction metrics...")
+         run_reaction_extraction(manuscript_path, si_path, cleaned_csv, reaction_csv, debug_dir=debug_dir)
+
+         # Add small delay to avoid API rate limits
+         time.sleep(2)
+
+         # Run substrate scope extraction
+         logger.info(" - Extracting substrate scope...")
+         run_substrate_scope_extraction(manuscript_path, si_path, cleaned_csv, substrate_csv, debug_dir=debug_dir)
+
+         # Step 4: Format and merge
+         logger.info("\n[Step 4/5] Formatting and merging data...")
+         run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
+
+         # Step 5: Finalize
+         logger.info("\n[Step 5/5] Finalizing...")
+         elapsed = time.time() - start_time
+
+         if keep_intermediates:
+             logger.info(f"All intermediate files saved in: {output_dir}")
+         else:
+             logger.info("Note: Use --keep-intermediates to save intermediate files")
+
+         logger.info("\n" + "="*60)
+         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
+         logger.info(f"Output: {output_path}")
+         logger.info(f"Runtime: {elapsed:.1f} seconds")
+         logger.info("="*60)
+
+         return output_path
+
+     except Exception as e:
+         logger.error(f"Pipeline failed: {str(e)}")
+         raise
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description='DEBase Enzyme Analysis Pipeline - Extract enzyme data from chemistry papers',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Pipeline steps:
+ 1. enzyme_lineage_extractor - Extract enzyme variants from PDFs
+ 2. cleanup_sequence - Validate and clean protein sequences
+ 3. reaction_info_extractor - Extract reaction performance metrics
+ 4. substrate_scope_extractor - Extract substrate scope data
+ 5. lineage_format - Format and merge into final CSV
+
+ The pipeline automatically handles all steps sequentially.
+         """
+     )
+
+     # Required arguments
+     parser.add_argument(
+         '--manuscript',
+         type=Path,
+         required=True,
+         help='Path to manuscript PDF'
+     )
+
+     # Optional arguments
+     parser.add_argument(
+         '--si',
+         type=Path,
+         help='Path to supplementary information PDF'
+     )
+     parser.add_argument(
+         '--output',
+         type=Path,
+         help='Output CSV path (default: manuscript_name_debase.csv)'
+     )
+     parser.add_argument(
+         '--keep-intermediates',
+         action='store_true',
+         help='Keep intermediate files for debugging'
+     )
+     parser.add_argument(
+         '--debug-dir',
+         type=Path,
+         help='Directory for debug output (prompts, API responses)'
+     )
+
+     args = parser.parse_args()
+
+     # Check inputs
+     if not args.manuscript.exists():
+         parser.error(f"Manuscript not found: {args.manuscript}")
+     if args.si and not args.si.exists():
+         parser.error(f"SI not found: {args.si}")
+
+     # Run pipeline
+     try:
+         run_pipeline(
+             manuscript_path=args.manuscript,
+             si_path=args.si,
+             output_path=args.output,
+             keep_intermediates=args.keep_intermediates,
+             debug_dir=args.debug_dir
+         )
+     except Exception as e:
+         logger.error(f"Pipeline error: {e}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
debase-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,299 @@
+ Metadata-Version: 2.4
+ Name: debase
+ Version: 0.1.0
+ Summary: Enzyme lineage analysis and sequence extraction package
+ Home-page: https://github.com/YuemingLong/DEBase
+ Author: DEBase Team
+ Author-email: DEBase Team <ylong@caltech.edu>
+ License: MIT
+ Project-URL: Homepage, https://github.com/YuemingLong/DEBase
+ Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
+ Project-URL: Repository, https://github.com/YuemingLong/DEBase
+ Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas>=1.0.0
+ Requires-Dist: PyMuPDF>=1.18.0
+ Requires-Dist: numpy>=1.19.0
+ Requires-Dist: google-generativeai>=0.3.0
+ Requires-Dist: biopython>=1.78
+ Requires-Dist: requests>=2.25.0
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: tqdm>=4.60.0
+ Requires-Dist: openpyxl>=3.0.0
+ Requires-Dist: PyPDF2>=2.0.0
+ Requires-Dist: Pillow>=8.0.0
+ Requires-Dist: networkx>=2.5
+ Provides-Extra: rdkit
+ Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=6.0; extra == "dev"
+ Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: isort; extra == "dev"
+ Requires-Dist: flake8; extra == "dev"
+ Requires-Dist: mypy; extra == "dev"
+ Provides-Extra: docs
+ Requires-Dist: sphinx>=4.0; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
+ Requires-Dist: myst-parser; extra == "docs"
+ Dynamic: author
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-python
+
+ # DEBase
+
+ Enzyme lineage analysis and sequence extraction package with advanced parallel processing capabilities.
+
+ ## Installation
+
+ ```bash
+ pip install debase
+ ```
+
+ For full functionality with chemical SMILES support:
+
+ ```bash
+ pip install debase[rdkit]
+ ```
+
+ ## Requirements
+
+ - Python 3.8 or higher
+ - A Gemini API key (set as environment variable `GEMINI_API_KEY`; see the check below)
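+
+ A quick way to confirm the key is visible to Python before launching a long run (a minimal sketch, assuming DEBase reads the variable from the environment as described above):
+
+ ```python
+ import os
+
+ # Fail early with a clear message if the key is not exported in this shell.
+ if not os.environ.get("GEMINI_API_KEY"):
+     raise RuntimeError("GEMINI_API_KEY is not set; export it before running debase")
+ ```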
+
+ ## Recent Updates
+
+ - **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
+ - **Improved Model Support**: Updated to use stable Gemini models for better reliability
+ - **Enhanced PDB Integration**: Intelligent AI-based matching of PDB structures to enzyme variants
+ - **Better Filtering**: Automatic removal of non-enzyme entries (buffers, controls, media)
+ - **Optimized Performance**: Removed unnecessary rate limiting for faster processing
+ - **External Sequence Fetching**: Automatic retrieval from PDB and UniProt databases when sequences aren't in papers
+ - **Improved SI Processing**: Structure-aware extraction of supplementary information
+ - **Vision Support**: Extracts data from figures and tables using multimodal AI capabilities
+
+ ## Quick Start
+
+ ### Basic Usage
+ ```bash
+ # Run the full pipeline (sequential processing)
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv
+ ```
+
+ ### High-Performance Parallel Processing
+ ```bash
+ # Use parallel individual processing for maximum speed + accuracy
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
+     --use-parallel-individual --max-workers 5
+
+ # Use batch processing for maximum speed (slight accuracy trade-off)
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
+     --use-optimized-reaction --reaction-batch-size 5
+ ```
+
+ ## Processing Methods
+
+ DEBase offers three processing approaches optimized for different use cases:
+
+ ### 1. **Parallel Individual Processing** (Recommended)
+ - **42 individual API calls** (21 for reactions + 21 for substrate scope)
+ - **5 calls running simultaneously** for a 4-5x speedup (see the sketch below)
+ - **Maximum accuracy** - each enzyme gets dedicated attention
+ - **Best for:** Production use, important analyses
+
+ ```bash
+ debase --manuscript paper.pdf --si si.pdf --use-parallel-individual --max-workers 5
+ ```
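+
+ Conceptually, the per-enzyme calls are independent, so they can be fanned out with a thread pool. The sketch below is only an illustration of that idea, not the extractor's actual code; `extract_one` is a hypothetical stand-in for a single-enzyme API call:
+
+ ```python
+ from concurrent.futures import ThreadPoolExecutor
+
+ def extract_one(enzyme_id: str) -> dict:
+     """Hypothetical single-enzyme extraction call, used here only as a placeholder."""
+     return {"enzyme_id": enzyme_id}
+
+ enzyme_ids = [f"variant_{i}" for i in range(21)]
+
+ # Keep up to 5 requests in flight at once, mirroring --max-workers 5.
+ with ThreadPoolExecutor(max_workers=5) as pool:
+     results = list(pool.map(extract_one, enzyme_ids))
+ ```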
+
+ ### 2. **Batch Processing** (Fastest)
+ - **~8 total API calls** (multiple enzymes per call; see the batching sketch below)
+ - **Fastest processing** - up to 8x speedup
+ - **Good accuracy** - slight trade-off for complex chemical names
+ - **Best for:** Quick analyses, large-scale processing
+
+ ```bash
+ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-batch-size 5
+ ```
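+
+ Batch mode trades per-enzyme prompts for grouped prompts. As a rough sketch of the idea (not the extractor's actual implementation), enzymes are simply chunked so that each API call covers several variants:
+
+ ```python
+ def chunk(items: list, size: int = 5) -> list:
+     """Split a list of enzyme IDs into batches of `size`, one prompt per batch."""
+     return [items[i:i + size] for i in range(0, len(items), size)]
+
+ enzyme_ids = [f"variant_{i}" for i in range(21)]
+ batches = chunk(enzyme_ids, size=5)  # mirrors --reaction-batch-size 5
+ print(len(batches))  # 5 batched calls instead of 21 individual ones
+ ```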
+
+ ### 3. **Sequential Processing** (Most Accurate)
+ - **42 sequential API calls** (one at a time)
+ - **Highest accuracy** but slowest
+ - **Best for:** Critical analyses, small datasets
+
+ ```bash
+ debase --manuscript paper.pdf --si si.pdf  # Default method
+ ```
+
+ ## Performance Comparison
+
+ | Method | Total Time | API Calls | Accuracy | Best For |
+ |--------|------------|-----------|----------|----------|
+ | Sequential | ~45 min | 44 calls | Highest | Small datasets |
+ | **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
+ | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
+
+ ## Advanced Usage
+
+ ### Skip Steps with Existing Data
+ ```bash
+ # Skip lineage extraction if you already have it
+ debase --manuscript paper.pdf --si si.pdf --output output.csv \
+     --skip-lineage --existing-lineage existing_lineage.csv \
+     --use-parallel-individual
+ ```
+
+ ### Direct Module Usage
+ ```bash
+ # Run only reaction extraction with parallel processing
+ python -m debase.reaction_info_extractor_parallel \
+     --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
+     --max-workers 5 --output reactions.csv
+
+ # Run only substrate scope extraction with parallel processing
+ python -m debase.substrate_scope_extractor_parallel \
+     --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
+     --max-workers 5 --output substrate_scope.csv
+ ```
+
+ ## Python API
+
175
+ ```python
176
+ from debase.wrapper import run_pipeline
177
+
178
+ # Run full pipeline with parallel processing
179
+ run_pipeline(
180
+ manuscript_path="paper.pdf",
181
+ si_path="si.pdf",
182
+ output="output.csv",
183
+ use_parallel_individual=True,
184
+ max_workers=5
185
+ )
+
+ # For individual steps
+ from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
+ from debase.enzyme_lineage_extractor import setup_gemini_api
+
+ model = setup_gemini_api()
+ reaction_data = extract_reaction_info_parallel(
+     model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
+ )
+ ```
+
+ ## Pipeline Architecture
+
+ The DEBase pipeline consists of 5 main steps:
+
+ 1. **Lineage Extraction** (Sequential) - Identifies all enzymes and their relationships
+    - Extracts mutation information and evolutionary paths
+    - Detects multiple directed evolution campaigns automatically
+    - Fetches sequences from external databases (PDB, UniProt)
+    - Filters out non-enzyme entries automatically
+ 2. **Sequence Cleanup** (Local) - Generates protein sequences from mutations
+    - Applies mutations to parent sequences (see the sketch after this list)
+    - Handles complex mutations and domain modifications
+    - Validates sequence integrity
+ 3. **Reaction Extraction** (Parallel/Batch/Sequential) - Extracts reaction conditions and performance data
+    - Campaign-aware extraction for multi-lineage papers
+    - Vision-based extraction from figures and tables
+    - Automatic IUPAC name resolution
+ 4. **Substrate Scope Extraction** (Parallel/Sequential) - Finds additional substrates tested
+ 5. **Data Formatting** (Local) - Combines all data into final output
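+
+ For Step 2, the core operation is mechanical: take the parent sequence and substitute the residues named by each mutation. The sketch below covers simple point mutations only; the packaged `cleanup_sequence` module handles many more cases, so treat this as an illustration rather than its implementation:
+
+ ```python
+ import re
+
+ def apply_point_mutations(parent: str, mutations: str) -> str:
+     """Apply mutations like 'A82F V78L' to a parent protein sequence (1-based positions)."""
+     seq = list(parent)
+     for mut in mutations.split():
+         match = re.fullmatch(r"([A-Z])(\d+)([A-Z])", mut)
+         if not match:
+             continue  # skip anything that is not a simple substitution
+         old, pos, new = match.group(1), int(match.group(2)), match.group(3)
+         if seq[pos - 1] != old:
+             raise ValueError(f"Parent residue mismatch at {mut}")
+         seq[pos - 1] = new
+     return "".join(seq)
+ ```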
+
+ ## Features
+
+ - **Multi-processing modes:** Sequential, parallel individual, and batch processing
+ - **Campaign detection:** Automatically identifies and separates multiple directed evolution campaigns
+ - **Intelligent error handling:** Automatic retries with exponential backoff (sketched below)
+ - **External database integration:** Automatic sequence fetching from PDB and UniProt
+ - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
+ - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
+ - **Progress tracking:** Real-time status updates
+ - **Flexible output:** CSV format with comprehensive chemical and performance data
+ - **Caching:** PDF encoding cache for improved performance
+ - **Vision capabilities:** Extracts data from both text and images in PDFs
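+
+ The retry behaviour can be pictured as a standard exponential backoff loop. This is a generic sketch (not DEBase's internal code) of retrying a flaky API call up to `--max-retries` times:
+
+ ```python
+ import time
+
+ def call_with_retries(fn, max_retries: int = 2, base_delay: float = 2.0):
+     """Call fn(), waiting 2s, 4s, 8s, ... between attempts after each failure."""
+     for attempt in range(max_retries + 1):
+         try:
+             return fn()
+         except Exception:
+             if attempt == max_retries:
+                 raise
+             time.sleep(base_delay * (2 ** attempt))
+ ```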
+
+ ## Complete Command Reference
+
+ ### Core Arguments
+ ```bash
+ --manuscript PATH        # Required: Path to manuscript PDF
+ --si PATH                # Optional: Path to supplementary information PDF
+ --output PATH            # Output file path (default: manuscript_name_debase.csv)
+ --queries N              # Number of consensus queries (default: 2)
+ ```
+
+ ### Performance Options
+ ```bash
+ --use-parallel-individual   # Use parallel processing (recommended)
+ --max-workers N             # Number of parallel workers (default: 5)
+ --use-optimized-reaction    # Use batch processing for speed
+ --reaction-batch-size N     # Enzymes per batch (default: 5)
+ --no-parallel-queries       # Disable parallel processing
+ ```
+
+ ### Pipeline Control
+ ```bash
+ --skip-lineage           # Skip lineage extraction step
+ --skip-sequence          # Skip sequence cleanup step
+ --skip-reaction          # Skip reaction extraction step
+ --skip-substrate-scope   # Skip substrate scope extraction step
+ --skip-lineage-format    # Skip final formatting step
+ --skip-validation        # Skip data validation step
+ ```
+
+ ### Data Management
+ ```bash
+ --existing-lineage PATH    # Use existing lineage data
+ --existing-sequence PATH   # Use existing sequence data
+ --existing-reaction PATH   # Use existing reaction data
+ --keep-intermediates       # Preserve intermediate files
+ ```
+
+ ### Advanced Options
+ ```bash
+ --model-name NAME        # Gemini model to use
+ --max-retries N          # Maximum retry attempts (default: 2)
+ --max-chars N            # Max characters from PDFs (default: 75000)
+ --debug-dir PATH         # Directory for debug output (prompts, API responses)
+ ```
+
+ ## Tips for Best Performance
+
+ 1. **Use parallel individual processing** for the best balance of speed and accuracy
+ 2. **Set max-workers to 5** to avoid API rate limits while maximizing throughput
+ 3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
+ 4. **Skip validation** (`--skip-validation`) for faster processing in production
+ 5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
+ 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
+ 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
+
+ ## Troubleshooting
+
+ ### No sequences found
+ - The extractor will automatically search PDB and UniProt databases (a minimal lookup sketch is shown below)
+ - Check the logs for which database IDs were found and attempted
+ - Sequences with PDB structures will be fetched with high confidence
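+
+ If you want to verify such a lookup by hand, the sequence for a PDB entry can be pulled from RCSB's FASTA download endpoint. This is an independent sanity check, not the extractor's own code path:
+
+ ```python
+ import requests
+
+ def fetch_pdb_fasta(pdb_id: str) -> str:
+     """Download the FASTA record for a PDB entry (e.g. '4HHB') from RCSB."""
+     url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
+     response = requests.get(url, timeout=30)
+     response.raise_for_status()
+     return response.text
+ ```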
+
+ ### Incorrect enzyme extraction
+ - Non-enzyme entries (buffers, controls, media) are automatically filtered
+ - Check the log for entries marked as "Filtering out non-enzyme entry"
+
+ ### PDB matching issues
+ - The system uses AI to match PDB IDs to specific enzyme variants
+ - Increased context extraction ensures better matching accuracy
+ - Check logs for "Gemini PDB matching" entries to see the matching process
debase-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+ debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+ debase/_version.py,sha256=HnfC_TWAA2mfjIbkXT0ipZEqElS5wLaMzSj1DkE1F88,49
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+ debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
+ debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
+ debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
+ debase/reaction_info_extractor.py,sha256=euw-4NHFuOPxpF99PJxTMLYYG0WryBDUCpoANB-SPPM,109655
+ debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
+ debase/wrapper.py,sha256=UlUBxxIXBnVtSIT9lZXkQeImlCABiUuof1CVZNKv9N4,10482
+ debase-0.1.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+ debase-0.1.0.dist-info/METADATA,sha256=3s1NGPGYOb2bbP5PD5OoWBcJ7UeZ2OTQiOQ-SE5uqoM,11509
+ debase-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ debase-0.1.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+ debase-0.1.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+ debase-0.1.0.dist-info/RECORD,,
debase-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
debase-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ debase = debase.__main__:main
debase-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 DEBase Contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
debase-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ debase