debase 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Jupyter Notebook
53
+ .ipynb_checkpoints
54
+
55
+ # IPython
56
+ profile_default/
57
+ ipython_config.py
58
+
59
+ # pyenv
60
+ .python-version
61
+
62
+ # pipenv
63
+ Pipfile.lock
64
+
65
+ # poetry
66
+ poetry.lock
67
+
68
+ # pdm
69
+ .pdm.toml
70
+
71
+ # PEP 582
72
+ __pypackages__/
73
+
74
+ # Celery stuff
75
+ celerybeat-schedule
76
+ celerybeat.pid
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # pytype static type analyzer
109
+ .pytype/
110
+
111
+ # Cython debug symbols
112
+ cython_debug/
113
+
114
+ # PyCharm
115
+ .idea/
116
+
117
+ # VS Code
118
+ .vscode/
119
+
120
+ # macOS
121
+ .DS_Store
122
+ .AppleDouble
123
+ .LSOverride
124
+
125
+ # Windows
126
+ Thumbs.db
127
+ Thumbs.db:encryptable
128
+ ehthumbs.db
129
+ ehthumbs_vista.db
130
+ *.stackdump
131
+ [Dd]esktop.ini
132
+ $RECYCLE.BIN/
133
+ *.cab
134
+ *.msi
135
+ *.msix
136
+ *.msm
137
+ *.msp
138
+ *.lnk
139
+
140
+ # Linux
141
+ *~
142
+
143
+ # Temporary files
144
+ *.tmp
145
+ *.temp
146
+ *.log
147
+ .temp_*/
148
+ .cache/
149
+
150
+ # DEBase specific
151
+ enzyme_pipeline*.log
152
+ temp_merged_input.csv
153
+ *.egg-info/
154
+
155
+ # Project data and examples (examples/test.csv is kept as example output)
156
+ data/
157
+ examples/*
158
+ !examples/test.csv
159
+
160
+ # Cache files
161
+ *.pkl
162
+ *_cache.pkl
163
+
164
+ # Large database files
165
+ *.db
166
+
167
+ # PDFs and Excel files
168
+ *.pdf
169
+ *.xlsx
170
+
171
+ # Backup files
172
+ *_backup.py
173
+ lineage_format_backup.py
174
+
175
+ # Temporary directories
176
+ .temp_*
177
+ enzyme_analysis_*
@@ -0,0 +1,61 @@
1
+ # Contributing to DEBase
2
+
3
+ Thank you for your interest in contributing to DEBase!
4
+
5
+ ## Development Setup
6
+
7
+ 1. Clone the repository:
8
+ ```bash
9
+ git clone https://github.com/YuemingLong/DEBase.git
10
+ cd DEBase
11
+ ```
12
+
13
+ 2. Create a virtual environment:
14
+ ```bash
15
+ python -m venv venv
16
+ source venv/bin/activate # On Windows: venv\Scripts\activate
17
+ ```
18
+
19
+ 3. Install in development mode:
20
+ ```bash
21
+ pip install -e ".[dev]"
22
+ ```
23
+
24
+ ## Running Tests
25
+
26
+ ```bash
27
+ pytest tests/
28
+ ```
29
+
30
+ ## Code Style
31
+
32
+ We use Black for code formatting:
33
+ ```bash
34
+ black src/ tests/
35
+ ```
36
+
37
+ And isort for import sorting:
38
+ ```bash
39
+ isort src/ tests/
40
+ ```
41
+
42
+ ## Project Structure
43
+
44
+ ```
45
+ debase/
46
+ ├── src/debase/ # Main package source code
47
+ ├── tests/ # Test suite
48
+ ├── docs/ # Documentation
49
+ ├── examples/ # Example outputs and usage
50
+ ├── data/ # Research data (PDFs)
51
+ └── scripts/ # Utility scripts
52
+ ```
53
+
54
+ ## Submitting Changes
55
+
56
+ 1. Fork the repository
57
+ 2. Create a feature branch
58
+ 3. Make your changes
59
+ 4. Add tests if applicable
60
+ 5. Run the test suite
61
+ 6. Submit a pull request
debase-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 DEBase Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,9 @@
1
+ include README.md
2
+ include LICENSE
3
+ include src/debase/_version.py
4
+ recursive-exclude * __pycache__
5
+ recursive-exclude * *.py[co]
6
+ recursive-exclude src/debase/Old *
7
+ recursive-exclude examples *
8
+ recursive-exclude tests *
9
+ prune src/debase/Old
debase-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: debase
3
+ Version: 0.1.0
4
+ Summary: Enzyme lineage analysis and sequence extraction package
5
+ Home-page: https://github.com/YuemingLong/DEBase
6
+ Author: DEBase Team
7
+ Author-email: DEBase Team <ylong@caltech.edu>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/YuemingLong/DEBase
10
+ Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
11
+ Project-URL: Repository, https://github.com/YuemingLong/DEBase
12
+ Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
24
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: pandas>=1.0.0
29
+ Requires-Dist: PyMuPDF>=1.18.0
30
+ Requires-Dist: numpy>=1.19.0
31
+ Requires-Dist: google-generativeai>=0.3.0
32
+ Requires-Dist: biopython>=1.78
33
+ Requires-Dist: requests>=2.25.0
34
+ Requires-Dist: httpx>=0.24.0
35
+ Requires-Dist: tqdm>=4.60.0
36
+ Requires-Dist: openpyxl>=3.0.0
37
+ Requires-Dist: PyPDF2>=2.0.0
38
+ Requires-Dist: Pillow>=8.0.0
39
+ Requires-Dist: networkx>=2.5
40
+ Provides-Extra: rdkit
41
+ Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
42
+ Provides-Extra: dev
43
+ Requires-Dist: pytest>=6.0; extra == "dev"
44
+ Requires-Dist: pytest-cov; extra == "dev"
45
+ Requires-Dist: black; extra == "dev"
46
+ Requires-Dist: isort; extra == "dev"
47
+ Requires-Dist: flake8; extra == "dev"
48
+ Requires-Dist: mypy; extra == "dev"
49
+ Provides-Extra: docs
50
+ Requires-Dist: sphinx>=4.0; extra == "docs"
51
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
52
+ Requires-Dist: myst-parser; extra == "docs"
53
+ Dynamic: author
54
+ Dynamic: home-page
55
+ Dynamic: license-file
56
+ Dynamic: requires-python
57
+
58
+ # DEBase
59
+
60
+ Enzyme lineage analysis and sequence extraction package with advanced parallel processing capabilities.
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ pip install debase
66
+ ```
67
+
68
+ For full functionality with chemical SMILES support:
69
+
70
+ ```bash
71
+ pip install "debase[rdkit]"
72
+ ```
73
+
74
+ ## Requirements
75
+
76
+ - Python 3.8 or higher
77
+ - A Gemini API key (set as environment variable `GEMINI_API_KEY`)
78
+
79
+ ## Recent Updates
80
+
81
+ - **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
82
+ - **Improved Model Support**: Updated to use stable Gemini models for better reliability
83
+ - **Enhanced PDB Integration**: Intelligent AI-based matching of PDB structures to enzyme variants
84
+ - **Better Filtering**: Automatic removal of non-enzyme entries (buffers, controls, media)
85
+ - **Optimized Performance**: Removed unnecessary rate limiting for faster processing
86
+ - **External Sequence Fetching**: Automatic retrieval from PDB and UniProt databases when sequences aren't in papers
87
+ - **Improved SI Processing**: Structure-aware extraction of supplementary information
88
+ - **Vision Support**: Extracts data from figures and tables using multimodal AI capabilities
89
+
90
+ ## Quick Start
91
+
92
+ ### Basic Usage
93
+ ```bash
94
+ # Run the full pipeline (sequential processing)
95
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv
96
+ ```
97
+
98
+ ### High-Performance Parallel Processing
99
+ ```bash
100
+ # Use parallel individual processing for maximum speed + accuracy
101
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
102
+ --use-parallel-individual --max-workers 5
103
+
104
+ # Use batch processing for maximum speed (slight accuracy trade-off)
105
+ debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
106
+ --use-optimized-reaction --reaction-batch-size 5
107
+ ```
108
+
109
+ ## Processing Methods
110
+
111
+ DEBase offers three processing approaches optimized for different use cases:
112
+
113
+ ### 1. **Parallel Individual Processing** (Recommended)
114
+ - **42 individual API calls** (21 for reactions + 21 for substrate scope)
115
+ - **5 calls running simultaneously** for 4-5x speedup
116
+ - **High accuracy** - each enzyme gets dedicated attention
117
+ - **Best for:** Production use, important analyses
118
+
119
+ ```bash
120
+ debase --manuscript paper.pdf --si si.pdf --use-parallel-individual --max-workers 5
121
+ ```
122
+
123
+ ### 2. **Batch Processing** (Fastest)
124
+ - **~8 total API calls** (multiple enzymes per call)
125
+ - **Fastest processing** - up to 8x speedup
126
+ - **Good accuracy** - slight trade-off for complex chemical names
127
+ - **Best for:** Quick analyses, large-scale processing
128
+
129
+ ```bash
130
+ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-batch-size 5
131
+ ```
132
+
133
+ ### 3. **Sequential Processing** (Most Accurate)
134
+ - **42 sequential API calls** (one at a time)
135
+ - **Highest accuracy** but slowest
136
+ - **Best for:** Critical analyses, small datasets
137
+
138
+ ```bash
139
+ debase --manuscript paper.pdf --si si.pdf # Default method
140
+ ```
141
+
142
+ ## Performance Comparison
143
+
144
+ | Method | Total Time | API Calls | Accuracy | Best For |
145
+ |--------|------------|-----------|----------|----------|
146
+ | Sequential | ~45 min | 42 calls | Highest | Small datasets |
147
+ | **Parallel Individual** | **~12 min** | **42 calls** | **High** | **Recommended** |
148
+ | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
149
+
150
+ ## Advanced Usage
151
+
152
+ ### Skip Steps with Existing Data
153
+ ```bash
154
+ # Skip lineage extraction if you already have it
155
+ debase --manuscript paper.pdf --si si.pdf --output output.csv \
156
+ --skip-lineage --existing-lineage existing_lineage.csv \
157
+ --use-parallel-individual
158
+ ```
159
+
160
+ ### Direct Module Usage
161
+ ```bash
162
+ # Run only reaction extraction with parallel processing
163
+ python -m debase.reaction_info_extractor_parallel \
164
+ --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
165
+ --max-workers 5 --output reactions.csv
166
+
167
+ # Run only substrate scope extraction with parallel processing
168
+ python -m debase.substrate_scope_extractor_parallel \
169
+ --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
170
+ --max-workers 5 --output substrate_scope.csv
171
+ ```
172
+
173
+ ## Python API
174
+
175
+ ```python
176
+ from debase.wrapper import run_pipeline
177
+
178
+ # Run full pipeline with parallel processing
179
+ run_pipeline(
180
+ manuscript_path="paper.pdf",
181
+ si_path="si.pdf",
182
+ output="output.csv",
183
+ use_parallel_individual=True,
184
+ max_workers=5
185
+ )
186
+
187
+ # For individual steps
188
+ from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
189
+ from debase.enzyme_lineage_extractor import setup_gemini_api
190
+
191
+ model = setup_gemini_api()
192
+ reaction_data = extract_reaction_info_parallel(
193
+ model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
194
+ )
195
+ ```
196
+
197
+ ## Pipeline Architecture
198
+
199
+ The DEBase pipeline consists of 5 main steps:
200
+
201
+ 1. **Lineage Extraction** (Sequential) - Identifies all enzymes and their relationships
202
+ - Extracts mutation information and evolutionary paths
203
+ - Detects multiple directed evolution campaigns automatically
204
+ - Fetches sequences from external databases (PDB, UniProt)
205
+ - Filters out non-enzyme entries automatically
206
+ 2. **Sequence Cleanup** (Local) - Generates protein sequences from mutations
207
+ - Applies mutations to parent sequences
208
+ - Handles complex mutations and domain modifications
209
+ - Validates sequence integrity
210
+ 3. **Reaction Extraction** (Parallel/Batch/Sequential) - Extracts reaction conditions and performance data
211
+ - Campaign-aware extraction for multi-lineage papers
212
+ - Vision-based extraction from figures and tables
213
+ - Automatic IUPAC name resolution
214
+ 4. **Substrate Scope Extraction** (Parallel/Sequential) - Finds additional substrates tested
215
+ 5. **Data Formatting** (Local) - Combines all data into final output
216
+
217
+ ## Features
218
+
219
+ - **Multi-processing modes:** Sequential, parallel individual, and batch processing
220
+ - **Campaign detection:** Automatically identifies and separates multiple directed evolution campaigns
221
+ - **Intelligent error handling:** Automatic retries with exponential backoff
222
+ - **External database integration:** Automatic sequence fetching from PDB and UniProt
223
+ - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
224
+ - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
225
+ - **Progress tracking:** Real-time status updates
226
+ - **Flexible output:** CSV format with comprehensive chemical and performance data
227
+ - **Caching:** PDF encoding cache for improved performance
228
+ - **Vision capabilities:** Extracts data from both text and images in PDFs
229
+
230
+ ## Complete Command Reference
231
+
232
+ ### Core Arguments
233
+ ```bash
234
+ --manuscript PATH # Required: Path to manuscript PDF
235
+ --si PATH # Optional: Path to supplementary information PDF
236
+ --output PATH # Output file path (default: manuscript_name_debase.csv)
237
+ --queries N # Number of consensus queries (default: 2)
238
+ ```
239
+
240
+ ### Performance Options
241
+ ```bash
242
+ --use-parallel-individual # Use parallel processing (recommended)
243
+ --max-workers N # Number of parallel workers (default: 5)
244
+ --use-optimized-reaction # Use batch processing for speed
245
+ --reaction-batch-size N # Enzymes per batch (default: 5)
246
+ --no-parallel-queries # Disable parallel processing
247
+ ```
248
+
249
+ ### Pipeline Control
250
+ ```bash
251
+ --skip-lineage # Skip lineage extraction step
252
+ --skip-sequence # Skip sequence cleanup step
253
+ --skip-reaction # Skip reaction extraction step
254
+ --skip-substrate-scope # Skip substrate scope extraction step
255
+ --skip-lineage-format # Skip final formatting step
256
+ --skip-validation # Skip data validation step
257
+ ```
258
+
259
+ ### Data Management
260
+ ```bash
261
+ --existing-lineage PATH # Use existing lineage data
262
+ --existing-sequence PATH # Use existing sequence data
263
+ --existing-reaction PATH # Use existing reaction data
264
+ --keep-intermediates # Preserve intermediate files
265
+ ```
266
+
267
+ ### Advanced Options
268
+ ```bash
269
+ --model-name NAME # Gemini model to use
270
+ --max-retries N # Maximum retry attempts (default: 2)
271
+ --max-chars N # Max characters from PDFs (default: 75000)
272
+ --debug-dir PATH # Directory for debug output (prompts, API responses)
273
+ ```
274
+
275
+ ## Tips for Best Performance
276
+
277
+ 1. **Use parallel individual processing** for the best balance of speed and accuracy
278
+ 2. **Set max-workers to 5** to avoid API rate limits while maximizing throughput
279
+ 3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
280
+ 4. **Skip validation** (`--skip-validation`) for faster processing in production
281
+ 5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
282
+ 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
283
+ 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
284
+
285
+ ## Troubleshooting
286
+
287
+ ### No sequences found
288
+ - The extractor will automatically search PDB and UniProt databases
289
+ - Check the logs for which database IDs were found and attempted
290
+ - Sequences with PDB structures will be fetched with high confidence
291
+
292
+ ### Incorrect enzyme extraction
293
+ - Non-enzyme entries (buffers, controls, media) are automatically filtered
294
+ - Check the log for entries marked as "Filtering out non-enzyme entry"
295
+
296
+ ### PDB matching issues
297
+ - The system uses AI to match PDB IDs to specific enzyme variants
298
+ - Increased context extraction ensures better matching accuracy
299
+ - Check logs for "Gemini PDB matching" entries to see the matching process