debase 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/_version.py +1 -1
- debase/reaction_info_extractor.py +38 -6
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/METADATA +2 -61
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/RECORD +9 -8
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/WHEEL +0 -0
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/entry_points.txt +0 -0
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.1.dist-info → debase-0.1.2.dist-info}/top_level.txt +0 -0
debase/PIPELINE_FLOW.md
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
# DEBase Pipeline Flow
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
|
5
|
+
|
6
|
+
## Pipeline Architecture
|
7
|
+
|
8
|
+
```
|
9
|
+
┌─────────────────────┐ ┌─────────────────────┐
|
10
|
+
│ Manuscript PDF │ │ SI PDF │
|
11
|
+
└──────────┬──────────┘ └──────────┬──────────┘
|
12
|
+
│ │
|
13
|
+
└───────────┬───────────────┘
|
14
|
+
│
|
15
|
+
▼
|
16
|
+
┌─────────────────────────────┐
|
17
|
+
│ 1. enzyme_lineage_extractor │
|
18
|
+
│ - Extract enzyme variants │
|
19
|
+
│ - Parse mutations │
|
20
|
+
│ - Get basic metadata │
|
21
|
+
└─────────────┬───────────────┘
|
22
|
+
│
|
23
|
+
▼
|
24
|
+
┌─────────────────────────────┐
|
25
|
+
│ 2. cleanup_sequence │
|
26
|
+
│ - Validate sequences │
|
27
|
+
│ - Fix formatting issues │
|
28
|
+
│ - Generate full sequences │
|
29
|
+
└─────────────┬───────────────┘
|
30
|
+
│
|
31
|
+
┌───────────┴───────────────┐
|
32
|
+
│ │
|
33
|
+
▼ ▼
|
34
|
+
┌─────────────────────────┐ ┌─────────────────────────┐
|
35
|
+
│ 3a. reaction_info │ │ 3b. substrate_scope │
|
36
|
+
│ _extractor │ │ _extractor │
|
37
|
+
│ - Performance metrics │ │ - Substrate variations │
|
38
|
+
│ - Model reaction │ │ - Additional variants │
|
39
|
+
│ - Conditions │ │ - Scope data │
|
40
|
+
└───────────┬─────────────┘ └───────────┬─────────────┘
|
41
|
+
│ │
|
42
|
+
└───────────┬───────────────┘
|
43
|
+
│
|
44
|
+
▼
|
45
|
+
┌─────────────────────────────┐
|
46
|
+
│ 4. lineage_format_o3 │
|
47
|
+
│ - Merge all data │
|
48
|
+
│ - Fill missing sequences │
|
49
|
+
│ - Format final output │
|
50
|
+
└─────────────┬───────────────┘
|
51
|
+
│
|
52
|
+
▼
|
53
|
+
┌─────────────┐
|
54
|
+
│ Final CSV │
|
55
|
+
└─────────────┘
|
56
|
+
```
|
57
|
+
|
58
|
+
## Module Details
|
59
|
+
|
60
|
+
### 1. enzyme_lineage_extractor.py
|
61
|
+
- **Input**: Manuscript PDF, SI PDF
|
62
|
+
- **Output**: CSV with enzyme variants and mutations
|
63
|
+
- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
|
64
|
+
|
65
|
+
### 2. cleanup_sequence.py
|
66
|
+
- **Input**: Enzyme lineage CSV
|
67
|
+
- **Output**: CSV with validated sequences
|
68
|
+
- **Function**: Validates protein sequences, generates full sequences from mutations
|
69
|
+
|
70
|
+
### 3a. reaction_info_extractor.py
|
71
|
+
- **Input**: PDFs + cleaned enzyme CSV
|
72
|
+
- **Output**: CSV with reaction performance data
|
73
|
+
- **Function**: Extracts yield, TTN, selectivity, reaction conditions
|
74
|
+
|
75
|
+
### 3b. substrate_scope_extractor.py
|
76
|
+
- **Input**: PDFs + cleaned enzyme CSV
|
77
|
+
- **Output**: CSV with substrate scope entries
|
78
|
+
- **Function**: Extracts substrate variations tested with different enzymes
|
79
|
+
|
80
|
+
### 4. lineage_format_o3.py
|
81
|
+
- **Input**: Reaction CSV + Substrate scope CSV
|
82
|
+
- **Output**: Final formatted CSV
|
83
|
+
- **Function**: Merges data, fills missing sequences, applies consistent formatting
|
84
|
+
|
85
|
+
## Key Features
|
86
|
+
|
87
|
+
1. **Modular Design**: Each step can be run independently
|
88
|
+
2. **Parallel Extraction**: Steps 3a and 3b run independently
|
89
|
+
3. **Error Recovery**: Pipeline can resume from any step
|
90
|
+
4. **Clean Interfaces**: Each module has well-defined inputs/outputs
|
91
|
+
|
92
|
+
## Usage
|
93
|
+
|
94
|
+
```bash
|
95
|
+
# Full pipeline
|
96
|
+
python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
|
97
|
+
|
98
|
+
# With intermediate files kept for debugging
|
99
|
+
python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
|
100
|
+
```
|
debase/_version.py
CHANGED
debase/reaction_info_extractor.py
CHANGED
@@ -685,7 +685,7 @@ Ignore locations that contain data for other campaigns.
|
|
685
685
|
'confidence': 95
|
686
686
|
}
|
687
687
|
|
688
|
-
def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
|
688
|
+
def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
689
689
|
"""Find the model reaction for a specific lineage group."""
|
690
690
|
# Gather relevant text near this location
|
691
691
|
page_text = self._page_with_reference(location) or ""
|
@@ -693,6 +693,7 @@ Ignore locations that contain data for other campaigns.
|
|
693
693
|
# Also check manuscript introduction for model reaction info
|
694
694
|
intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
|
695
695
|
|
696
|
+
# Build the prompt with location and context
|
696
697
|
prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
|
697
698
|
location=location,
|
698
699
|
group_context=group_context
|
@@ -700,6 +701,22 @@ Ignore locations that contain data for other campaigns.
|
|
700
701
|
prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
|
701
702
|
prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
|
702
703
|
|
704
|
+
# If we have model reaction locations, include text from those locations too
|
705
|
+
if model_reaction_locations:
|
706
|
+
# Add text from model reaction location
|
707
|
+
if model_reaction_locations.get("model_reaction_location", {}).get("location"):
|
708
|
+
model_loc = model_reaction_locations["model_reaction_location"]["location"]
|
709
|
+
model_text = self._get_text_around_location(model_loc)
|
710
|
+
if model_text:
|
711
|
+
prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
|
712
|
+
|
713
|
+
# Add text from conditions location (often contains reaction details)
|
714
|
+
if model_reaction_locations.get("conditions_location", {}).get("location"):
|
715
|
+
cond_loc = model_reaction_locations["conditions_location"]["location"]
|
716
|
+
cond_text = self._get_text_around_location(cond_loc)
|
717
|
+
if cond_text:
|
718
|
+
prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
|
719
|
+
|
703
720
|
try:
|
704
721
|
data = generate_json_with_retry(
|
705
722
|
self.model,
|
@@ -1790,8 +1807,16 @@ TEXT FROM MANUSCRIPT:
|
|
1790
1807
|
if location.get('caption'):
|
1791
1808
|
location_context += f"\nCaption: {location['caption']}"
|
1792
1809
|
|
1793
|
-
#
|
1794
|
-
|
1810
|
+
# First find model reaction locations for this campaign/enzyme group
|
1811
|
+
location_enzymes = df_location['enzyme'].unique().tolist()
|
1812
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1813
|
+
|
1814
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1815
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1816
|
+
location['location'],
|
1817
|
+
location_context,
|
1818
|
+
model_reaction_locations
|
1819
|
+
)
|
1795
1820
|
|
1796
1821
|
# Get full model reaction info with IUPAC names
|
1797
1822
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1799,7 +1824,6 @@ TEXT FROM MANUSCRIPT:
|
|
1799
1824
|
else:
|
1800
1825
|
# Fall back to general model reaction extraction
|
1801
1826
|
# Pass the enzyme variants from this location
|
1802
|
-
location_enzymes = df_location['enzyme'].unique().tolist()
|
1803
1827
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1804
1828
|
|
1805
1829
|
# Add model reaction info to all enzymes from this location
|
@@ -1891,7 +1915,16 @@ TEXT FROM MANUSCRIPT:
|
|
1891
1915
|
if group.get('caption'):
|
1892
1916
|
location_context += f"\nCaption: {group['caption']}"
|
1893
1917
|
|
1894
|
-
|
1918
|
+
# First find model reaction locations for this enzyme group
|
1919
|
+
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1920
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1921
|
+
|
1922
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1923
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1924
|
+
group_location,
|
1925
|
+
location_context,
|
1926
|
+
model_reaction_locations
|
1927
|
+
)
|
1895
1928
|
|
1896
1929
|
# Get full model reaction info with IUPAC names
|
1897
1930
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1899,7 +1932,6 @@ TEXT FROM MANUSCRIPT:
|
|
1899
1932
|
else:
|
1900
1933
|
# Try to extract model reaction from this specific location
|
1901
1934
|
# Pass the enzyme variants that have data in this location
|
1902
|
-
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1903
1935
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1904
1936
|
|
1905
1937
|
# Add model reaction info to all enzymes from this location
|
{debase-0.1.1.dist-info → debase-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: debase
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Enzyme lineage analysis and sequence extraction package
|
5
5
|
Home-page: https://github.com/YuemingLong/DEBase
|
6
6
|
Author: DEBase Team
|
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
64
64
|
```bash
|
65
65
|
pip install debase
|
66
66
|
```
|
67
|
-
|
68
|
-
For full functionality with chemical SMILES support:
|
69
|
-
|
70
|
-
```bash
|
71
|
-
pip install debase[rdkit]
|
72
|
-
```
|
73
|
-
|
74
67
|
## Requirements
|
75
68
|
|
76
69
|
- Python 3.8 or higher
|
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
|
|
139
132
|
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
133
|
```
|
141
134
|
|
142
|
-
## Performance Comparison
|
143
|
-
|
144
|
-
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
-
|--------|------------|-----------|----------|----------|
|
146
|
-
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
147
|
-
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
148
|
-
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
135
|
|
150
136
|
## Advanced Usage
|
151
137
|
|
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
|
|
169
155
|
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
156
|
--max-workers 5 --output substrate_scope.csv
|
171
157
|
```
|
172
|
-
|
173
|
-
## Python API
|
174
|
-
|
175
|
-
```python
|
176
|
-
from debase.wrapper import run_pipeline
|
177
|
-
|
178
|
-
# Run full pipeline with parallel processing
|
179
|
-
run_pipeline(
|
180
|
-
manuscript_path="paper.pdf",
|
181
|
-
si_path="si.pdf",
|
182
|
-
output="output.csv",
|
183
|
-
use_parallel_individual=True,
|
184
|
-
max_workers=5
|
185
|
-
)
|
186
|
-
|
187
|
-
# For individual steps
|
188
|
-
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
-
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
-
|
191
|
-
model = setup_gemini_api()
|
192
|
-
reaction_data = extract_reaction_info_parallel(
|
193
|
-
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
-
)
|
195
|
-
```
|
196
|
-
|
197
158
|
## Pipeline Architecture
|
198
159
|
|
199
160
|
The DEBase pipeline consists of 5 main steps:
|
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
222
183
|
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
184
|
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
185
|
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
-
- **Progress tracking:** Real-time status updates
|
226
|
-
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
-
- **Caching:** PDF encoding cache for improved performance
|
228
186
|
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
187
|
|
230
188
|
## Complete Command Reference
|
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
234
192
|
--manuscript PATH # Required: Path to manuscript PDF
|
235
193
|
--si PATH # Optional: Path to supplementary information PDF
|
236
194
|
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
-
--queries N # Number of consensus queries (default: 2)
|
238
195
|
```
|
239
196
|
|
240
197
|
### Performance Options
|
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
|
|
279
236
|
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
237
|
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
238
|
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
-
6.
|
283
|
-
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
-
|
285
|
-
## Troubleshooting
|
286
|
-
|
287
|
-
### No sequences found
|
288
|
-
- The extractor will automatically search PDB and UniProt databases
|
289
|
-
- Check the logs for which database IDs were found and attempted
|
290
|
-
- Sequences with PDB structures will be fetched with high confidence
|
291
|
-
|
292
|
-
### Incorrect enzyme extraction
|
293
|
-
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
-
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
239
|
+
6.
|
295
240
|
|
296
|
-
### PDB matching issues
|
297
|
-
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
-
- Increased context extraction ensures better matching accuracy
|
299
|
-
- Check logs for "Gemini PDB matching" entries to see the matching process
|
{debase-0.1.1.dist-info → debase-0.1.2.dist-info}/RECORD
CHANGED
@@ -1,16 +1,17 @@
|
|
1
|
+
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
1
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=rPOdIIhUKYST-L457GFA8SWkOdMGZQAiiaWLSYHnVwc,49
|
4
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
6
7
|
debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
|
7
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
8
|
-
debase/reaction_info_extractor.py,sha256=
|
9
|
+
debase/reaction_info_extractor.py,sha256=vgXE4eFSmRUU_RPsW7E0vbP5mU0tjrhwk7UVqz_98yM,111469
|
9
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
10
11
|
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
11
|
-
debase-0.1.
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
12
|
+
debase-0.1.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.2.dist-info/METADATA,sha256=t5JrPGNEtLsF3qXrSpHvn02_rlNGlkYv-NDubaXZa2w,9382
|
14
|
+
debase-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|