debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +985 -100
- debase/lineage_format.py +226 -13
- debase/reaction_info_extractor.py +178 -34
- debase/substrate_scope_extractor.py +52 -4
- debase/wrapper.py +155 -151
- debase-0.4.5.dist-info/METADATA +121 -0
- debase-0.4.5.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0
debase-0.4.3.dist-info/METADATA
DELETED
@@ -1,296 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: debase
|
3
|
-
Version: 0.4.3
|
4
|
-
Summary: Enzyme lineage analysis and sequence extraction package
|
5
|
-
Home-page: https://github.com/YuemingLong/DEBase
|
6
|
-
Author: DEBase Team
|
7
|
-
Author-email: DEBase Team <ylong@caltech.edu>
|
8
|
-
License: MIT
|
9
|
-
Project-URL: Homepage, https://github.com/YuemingLong/DEBase
|
10
|
-
Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
|
11
|
-
Project-URL: Repository, https://github.com/YuemingLong/DEBase
|
12
|
-
Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
|
13
|
-
Classifier: Development Status :: 4 - Beta
|
14
|
-
Classifier: Intended Audience :: Science/Research
|
15
|
-
Classifier: License :: OSI Approved :: MIT License
|
16
|
-
Classifier: Operating System :: OS Independent
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
19
|
-
Classifier: Programming Language :: Python :: 3.9
|
20
|
-
Classifier: Programming Language :: Python :: 3.10
|
21
|
-
Classifier: Programming Language :: Python :: 3.11
|
22
|
-
Classifier: Programming Language :: Python :: 3.12
|
23
|
-
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
24
|
-
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
25
|
-
Requires-Python: >=3.8
|
26
|
-
Description-Content-Type: text/markdown
|
27
|
-
License-File: LICENSE
|
28
|
-
Requires-Dist: pandas>=1.0.0
|
29
|
-
Requires-Dist: PyMuPDF>=1.18.0
|
30
|
-
Requires-Dist: numpy>=1.19.0
|
31
|
-
Requires-Dist: google-generativeai>=0.3.0
|
32
|
-
Requires-Dist: biopython>=1.78
|
33
|
-
Requires-Dist: requests>=2.25.0
|
34
|
-
Requires-Dist: httpx>=0.24.0
|
35
|
-
Requires-Dist: tqdm>=4.60.0
|
36
|
-
Requires-Dist: openpyxl>=3.0.0
|
37
|
-
Requires-Dist: PyPDF2>=2.0.0
|
38
|
-
Requires-Dist: Pillow>=8.0.0
|
39
|
-
Requires-Dist: networkx>=2.5
|
40
|
-
Provides-Extra: rdkit
|
41
|
-
Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
|
42
|
-
Provides-Extra: dev
|
43
|
-
Requires-Dist: pytest>=6.0; extra == "dev"
|
44
|
-
Requires-Dist: pytest-cov; extra == "dev"
|
45
|
-
Requires-Dist: black; extra == "dev"
|
46
|
-
Requires-Dist: isort; extra == "dev"
|
47
|
-
Requires-Dist: flake8; extra == "dev"
|
48
|
-
Requires-Dist: mypy; extra == "dev"
|
49
|
-
Provides-Extra: docs
|
50
|
-
Requires-Dist: sphinx>=4.0; extra == "docs"
|
51
|
-
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
52
|
-
Requires-Dist: myst-parser; extra == "docs"
|
53
|
-
Dynamic: author
|
54
|
-
Dynamic: home-page
|
55
|
-
Dynamic: license-file
|
56
|
-
Dynamic: requires-python
|
57
|
-
|
58
|
-
# DEBase
|
59
|
-
|
60
|
-
Enzyme lineage analysis and sequence extraction package with advanced parallel processing capabilities.
|
61
|
-
|
62
|
-
## Installation
|
63
|
-
|
64
|
-
### Quick Install (PyPI)
|
65
|
-
```bash
|
66
|
-
pip install debase
|
67
|
-
```
|
68
|
-
|
69
|
-
### Development Setup with Conda (Recommended)
|
70
|
-
|
71
|
-
1. **Clone the repository**
|
72
|
-
```bash
|
73
|
-
git clone https://github.com/YuemingLong/DEBase.git
|
74
|
-
cd DEBase
|
75
|
-
```
|
76
|
-
|
77
|
-
2. **Create conda environment from provided file**
|
78
|
-
```bash
|
79
|
-
conda env create -f environment.yml
|
80
|
-
conda activate debase
|
81
|
-
```
|
82
|
-
|
83
|
-
3. **Install DEBase in development mode**
|
84
|
-
```bash
|
85
|
-
pip install -e .
|
86
|
-
```
|
87
|
-
|
88
|
-
### Manual Setup
|
89
|
-
|
90
|
-
If you prefer to set up the environment manually:
|
91
|
-
|
92
|
-
```bash
|
93
|
-
# Create new conda environment
|
94
|
-
conda create -n debase python=3.9
|
95
|
-
conda activate debase
|
96
|
-
|
97
|
-
# Install conda packages
|
98
|
-
conda install -c conda-forge pandas numpy matplotlib seaborn jupyter jupyterlab openpyxl biopython requests tqdm
|
99
|
-
|
100
|
-
# Install RDKit (optional - used for SMILES canonicalization)
|
101
|
-
conda install -c conda-forge rdkit
|
102
|
-
|
103
|
-
# Install pip-only packages
|
104
|
-
pip install PyMuPDF google-generativeai debase
|
105
|
-
```
|
106
|
-
|
107
|
-
**Note about RDKit**: RDKit is optional and only used for canonicalizing SMILES strings in the output. If not installed, DEBase will still function normally but SMILES strings won't be standardized.
|
108
|
-
|
109
|
-
## Requirements
|
110
|
-
|
111
|
-
- Python 3.8 or higher
|
112
|
-
- A Gemini API key (set as environment variable `GEMINI_API_KEY`)
|
113
|
-
|
114
|
-
### Setting up Gemini API Key
|
115
|
-
|
116
|
-
```bash
|
117
|
-
# Option 1: Export in your shell
|
118
|
-
export GEMINI_API_KEY="your-api-key-here"
|
119
|
-
|
120
|
-
# Option 2: Add to ~/.bashrc or ~/.zshrc for persistence
|
121
|
-
echo 'export GEMINI_API_KEY="your-api-key-here"' >> ~/.bashrc
|
122
|
-
source ~/.bashrc
|
123
|
-
|
124
|
-
# Option 3: Create .env file in project directory
|
125
|
-
echo 'GEMINI_API_KEY=your-api-key-here' > .env
|
126
|
-
```
|
127
|
-
|
128
|
-
## Recent Updates
|
129
|
-
|
130
|
-
- **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
|
131
|
-
- **Improved Model Support**: Updated to use stable Gemini models for better reliability
|
132
|
-
- **Enhanced PDB Integration**: Intelligent AI-based matching of PDB structures to enzyme variants
|
133
|
-
- **Better Filtering**: Automatic removal of non-enzyme entries (buffers, controls, media)
|
134
|
-
- **Optimized Performance**: Removed unnecessary rate limiting for faster processing
|
135
|
-
- **External Sequence Fetching**: Automatic retrieval from PDB and UniProt databases when sequences aren't in papers
|
136
|
-
- **Improved SI Processing**: Structure-aware extraction of supplementary information
|
137
|
-
- **Vision Support**: Extracts data from figures and tables using multimodal AI capabilities
|
138
|
-
|
139
|
-
## Quick Start
|
140
|
-
|
141
|
-
### Basic Usage
|
142
|
-
```bash
|
143
|
-
# Run the full pipeline (sequential processing)
|
144
|
-
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv
|
145
|
-
```
|
146
|
-
|
147
|
-
### High-Performance Parallel Processing
|
148
|
-
```bash
|
149
|
-
# Use parallel individual processing for maximum speed + accuracy
|
150
|
-
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
151
|
-
--use-parallel-individual --max-workers 5
|
152
|
-
|
153
|
-
# Use batch processing for maximum speed (slight accuracy trade-off)
|
154
|
-
debase --manuscript manuscript.pdf --si supplementary.pdf --output output.csv \
|
155
|
-
--use-optimized-reaction --reaction-batch-size 5
|
156
|
-
```
|
157
|
-
|
158
|
-
## Processing Methods
|
159
|
-
|
160
|
-
DEBase offers three processing approaches optimized for different use cases:
|
161
|
-
|
162
|
-
### 1. **Parallel Individual Processing** (Recommended)
|
163
|
-
- **One API call per enzyme per step** (e.g., 42 individual calls for a 21-enzyme paper: 21 for reactions + 21 for substrate scope)
|
164
|
-
- **5 calls running simultaneously** for 4-5x speedup
|
165
|
-
- **Maximum accuracy** - each enzyme gets dedicated attention
|
166
|
-
- **Best for:** Production use, important analyses
|
167
|
-
|
168
|
-
```bash
|
169
|
-
debase --manuscript paper.pdf --si si.pdf --use-parallel-individual --max-workers 5
|
170
|
-
```
|
171
|
-
|
172
|
-
### 2. **Batch Processing** (Fastest)
|
173
|
-
- **~8 total API calls** (multiple enzymes per call)
|
174
|
-
- **Fastest processing** - up to 8x speedup
|
175
|
-
- **Good accuracy** - slight trade-off for complex chemical names
|
176
|
-
- **Best for:** Quick analyses, large-scale processing
|
177
|
-
|
178
|
-
```bash
|
179
|
-
debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-batch-size 5
|
180
|
-
```
|
181
|
-
|
182
|
-
### 3. **Sequential Processing** (Most Accurate)
|
183
|
-
- **42 sequential API calls** (one at a time)
|
184
|
-
- **Highest accuracy** but slowest
|
185
|
-
- **Best for:** Critical analyses, small datasets
|
186
|
-
|
187
|
-
```bash
|
188
|
-
debase --manuscript paper.pdf --si si.pdf # Default method
|
189
|
-
```
|
190
|
-
|
191
|
-
|
192
|
-
## Advanced Usage
|
193
|
-
|
194
|
-
### Skip Steps with Existing Data
|
195
|
-
```bash
|
196
|
-
# Skip lineage extraction if you already have it
|
197
|
-
debase --manuscript paper.pdf --si si.pdf --output output.csv \
|
198
|
-
--skip-lineage --existing-lineage existing_lineage.csv \
|
199
|
-
--use-parallel-individual
|
200
|
-
```
|
201
|
-
|
202
|
-
### Direct Module Usage
|
203
|
-
```bash
|
204
|
-
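# Run only reaction extraction with parallel processing (NOTE: verify that debase.reaction_info_extractor_parallel is installed; the 0.4.3 wheel RECORD lists only debase/reaction_info_extractor.py)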
|
205
|
-
python -m debase.reaction_info_extractor_parallel \
|
206
|
-
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
207
|
-
--max-workers 5 --output reactions.csv
|
208
|
-
|
209
|
-
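# Run only substrate scope extraction with parallel processing (NOTE: verify that debase.substrate_scope_extractor_parallel is installed; the 0.4.3 wheel RECORD lists only debase/substrate_scope_extractor.py)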
|
210
|
-
python -m debase.substrate_scope_extractor_parallel \
|
211
|
-
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
212
|
-
--max-workers 5 --output substrate_scope.csv
|
213
|
-
```
|
214
|
-
## Pipeline Architecture
|
215
|
-
|
216
|
-
The DEBase pipeline consists of 5 main steps:
|
217
|
-
|
218
|
-
1. **Lineage Extraction** (Sequential) - Identifies all enzymes and their relationships
|
219
|
-
- Extracts mutation information and evolutionary paths
|
220
|
-
- Detects multiple directed evolution campaigns automatically
|
221
|
-
- Fetches sequences from external databases (PDB, UniProt)
|
222
|
-
- Filters out non-enzyme entries automatically
|
223
|
-
2. **Sequence Cleanup** (Local) - Generates protein sequences from mutations
|
224
|
-
- Applies mutations to parent sequences
|
225
|
-
- Handles complex mutations and domain modifications
|
226
|
-
- Validates sequence integrity
|
227
|
-
3. **Reaction Extraction** (Parallel/Batch/Sequential) - Extracts reaction conditions and performance data
|
228
|
-
- Campaign-aware extraction for multi-lineage papers
|
229
|
-
- Vision-based extraction from figures and tables
|
230
|
-
- Automatic IUPAC name resolution
|
231
|
-
4. **Substrate Scope Extraction** (Parallel/Sequential) - Finds additional substrates tested
|
232
|
-
5. **Data Formatting** (Local) - Combines all data into final output
|
233
|
-
|
234
|
-
## Features
|
235
|
-
|
236
|
-
- **Multi-processing modes:** Sequential, parallel individual, and batch processing
|
237
|
-
- **Campaign detection:** Automatically identifies and separates multiple directed evolution campaigns
|
238
|
-
- **Intelligent error handling:** Automatic retries with exponential backoff
|
239
|
-
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
240
|
-
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
241
|
-
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
242
|
-
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
243
|
-
|
244
|
-
## Complete Command Reference
|
245
|
-
|
246
|
-
### Core Arguments
|
247
|
-
```bash
|
248
|
-
--manuscript PATH # Required: Path to manuscript PDF
|
249
|
-
--si PATH # Optional: Path to supplementary information PDF
|
250
|
-
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
251
|
-
```
|
252
|
-
|
253
|
-
### Performance Options
|
254
|
-
```bash
|
255
|
-
--use-parallel-individual # Use parallel processing (recommended)
|
256
|
-
--max-workers N # Number of parallel workers (default: 5)
|
257
|
-
--use-optimized-reaction # Use batch processing for speed
|
258
|
-
--reaction-batch-size N # Enzymes per batch (default: 5)
|
259
|
-
--no-parallel-queries # Disable parallel processing
|
260
|
-
```
|
261
|
-
|
262
|
-
### Pipeline Control
|
263
|
-
```bash
|
264
|
-
--skip-lineage # Skip lineage extraction step
|
265
|
-
--skip-sequence # Skip sequence cleanup step
|
266
|
-
--skip-reaction # Skip reaction extraction step
|
267
|
-
--skip-substrate-scope # Skip substrate scope extraction step
|
268
|
-
--skip-lineage-format # Skip final formatting step
|
269
|
-
--skip-validation # Skip data validation step
|
270
|
-
```
|
271
|
-
|
272
|
-
### Data Management
|
273
|
-
```bash
|
274
|
-
--existing-lineage PATH # Use existing lineage data
|
275
|
-
--existing-sequence PATH # Use existing sequence data
|
276
|
-
--existing-reaction PATH # Use existing reaction data
|
277
|
-
--keep-intermediates # Preserve intermediate files
|
278
|
-
```
|
279
|
-
|
280
|
-
### Advanced Options
|
281
|
-
```bash
|
282
|
-
--model-name NAME # Gemini model to use
|
283
|
-
--max-retries N # Maximum retry attempts (default: 2)
|
284
|
-
--max-chars N # Max characters from PDFs (default: 75000)
|
285
|
-
--debug-dir PATH # Directory for debug output (prompts, API responses)
|
286
|
-
```
|
287
|
-
|
288
|
-
## Tips for Best Performance
|
289
|
-
|
290
|
-
1. **Use parallel individual processing** for the best balance of speed and accuracy
|
291
|
-
2. **Set `--max-workers` to 5** (the default) to avoid API rate limits while maximizing throughput
|
292
|
-
3. **Use batch processing** (`--use-optimized-reaction`) only when speed is critical and some accuracy loss is acceptable
|
293
|
-
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
294
|
-
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
295
|
-
6.
|
296
|
-
|
debase-0.4.3.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=r0b4fvQcrrvOScFMddjVgWAGNt17iQxCJH2xYW06jio,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/cleanup_sequence.py,sha256=4qZrSXInyJKEJqcgcONp4IX24ALEj5lf7E0XaOZVxZ0,40329
|
6
|
-
debase/enzyme_lineage_extractor.py,sha256=tFyrcWkNKKr8T9xq0tIXUDNfcX0tbdWGrLhgo5m7lmA,129804
|
7
|
-
debase/lineage_format.py,sha256=Q6kpqKPUxJsMYpb0Yt8IbVlp6VDYX2vkITuGhT9MEbw,47056
|
8
|
-
debase/reaction_info_extractor.py,sha256=q-iHgfVLXP4r2Se8yA9I0AvtnAhHBltTztrXspl3EKU,151949
|
9
|
-
debase/substrate_scope_extractor.py,sha256=eaVimhxmmaRj-9dRN6RKK4yStCmZAuX8xBaarsIsmUo,114212
|
10
|
-
debase/wrapper.py,sha256=r0xxoiBvmMIktiGPOD4w9hne8m0SLzZ03WeWnBuDW0A,25236
|
11
|
-
debase-0.4.3.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
-
debase-0.4.3.dist-info/METADATA,sha256=UT8ymX3oothXvgA9ayr74_Bd-St7I0Pj7CoEg8LlKg8,10789
|
13
|
-
debase-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
debase-0.4.3.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
-
debase-0.4.3.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
-
debase-0.4.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|