debase 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/reaction_info_extractor.py +38 -6
- debase/wrapper.py +5 -7
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/METADATA +2 -61
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/RECORD +9 -9
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/WHEEL +0 -0
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/entry_points.txt +0 -0
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.0.dist-info → debase-0.1.2.dist-info}/top_level.txt +0 -0
debase/reaction_info_extractor.py
CHANGED
@@ -685,7 +685,7 @@ Ignore locations that contain data for other campaigns.
|
|
685
685
|
'confidence': 95
|
686
686
|
}
|
687
687
|
|
688
|
-
def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
|
688
|
+
def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
689
689
|
"""Find the model reaction for a specific lineage group."""
|
690
690
|
# Gather relevant text near this location
|
691
691
|
page_text = self._page_with_reference(location) or ""
|
@@ -693,6 +693,7 @@ Ignore locations that contain data for other campaigns.
|
|
693
693
|
# Also check manuscript introduction for model reaction info
|
694
694
|
intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
|
695
695
|
|
696
|
+
# Build the prompt with location and context
|
696
697
|
prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
|
697
698
|
location=location,
|
698
699
|
group_context=group_context
|
@@ -700,6 +701,22 @@ Ignore locations that contain data for other campaigns.
|
|
700
701
|
prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
|
701
702
|
prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
|
702
703
|
|
704
|
+
# If we have model reaction locations, include text from those locations too
|
705
|
+
if model_reaction_locations:
|
706
|
+
# Add text from model reaction location
|
707
|
+
if model_reaction_locations.get("model_reaction_location", {}).get("location"):
|
708
|
+
model_loc = model_reaction_locations["model_reaction_location"]["location"]
|
709
|
+
model_text = self._get_text_around_location(model_loc)
|
710
|
+
if model_text:
|
711
|
+
prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
|
712
|
+
|
713
|
+
# Add text from conditions location (often contains reaction details)
|
714
|
+
if model_reaction_locations.get("conditions_location", {}).get("location"):
|
715
|
+
cond_loc = model_reaction_locations["conditions_location"]["location"]
|
716
|
+
cond_text = self._get_text_around_location(cond_loc)
|
717
|
+
if cond_text:
|
718
|
+
prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
|
719
|
+
|
703
720
|
try:
|
704
721
|
data = generate_json_with_retry(
|
705
722
|
self.model,
|
@@ -1790,8 +1807,16 @@ TEXT FROM MANUSCRIPT:
|
|
1790
1807
|
if location.get('caption'):
|
1791
1808
|
location_context += f"\nCaption: {location['caption']}"
|
1792
1809
|
|
1793
|
-
#
|
1794
|
-
|
1810
|
+
# First find model reaction locations for this campaign/enzyme group
|
1811
|
+
location_enzymes = df_location['enzyme'].unique().tolist()
|
1812
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1813
|
+
|
1814
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1815
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1816
|
+
location['location'],
|
1817
|
+
location_context,
|
1818
|
+
model_reaction_locations
|
1819
|
+
)
|
1795
1820
|
|
1796
1821
|
# Get full model reaction info with IUPAC names
|
1797
1822
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1799,7 +1824,6 @@ TEXT FROM MANUSCRIPT:
|
|
1799
1824
|
else:
|
1800
1825
|
# Fall back to general model reaction extraction
|
1801
1826
|
# Pass the enzyme variants from this location
|
1802
|
-
location_enzymes = df_location['enzyme'].unique().tolist()
|
1803
1827
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1804
1828
|
|
1805
1829
|
# Add model reaction info to all enzymes from this location
|
@@ -1891,7 +1915,16 @@ TEXT FROM MANUSCRIPT:
|
|
1891
1915
|
if group.get('caption'):
|
1892
1916
|
location_context += f"\nCaption: {group['caption']}"
|
1893
1917
|
|
1894
|
-
|
1918
|
+
# First find model reaction locations for this enzyme group
|
1919
|
+
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1920
|
+
model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
|
1921
|
+
|
1922
|
+
# Try to find model reaction for this specific lineage, passing the locations
|
1923
|
+
location_model_reaction = self.find_lineage_model_reaction(
|
1924
|
+
group_location,
|
1925
|
+
location_context,
|
1926
|
+
model_reaction_locations
|
1927
|
+
)
|
1895
1928
|
|
1896
1929
|
# Get full model reaction info with IUPAC names
|
1897
1930
|
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
@@ -1899,7 +1932,6 @@ TEXT FROM MANUSCRIPT:
|
|
1899
1932
|
else:
|
1900
1933
|
# Try to extract model reaction from this specific location
|
1901
1934
|
# Pass the enzyme variants that have data in this location
|
1902
|
-
location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
|
1903
1935
|
model_info = self.gather_model_reaction_info(location_enzymes)
|
1904
1936
|
|
1905
1937
|
# Add model reaction info to all enzymes from this location
|
debase/wrapper.py
CHANGED
@@ -35,9 +35,7 @@ def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir:
|
|
35
35
|
"""
|
36
36
|
logger.info(f"Extracting enzyme lineage from {manuscript.name}")
|
37
37
|
|
38
|
-
import sys
|
39
|
-
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
40
|
-
from src.debase.enzyme_lineage_extractor import run_pipeline
|
38
|
+
from .enzyme_lineage_extractor import run_pipeline
|
41
39
|
run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)
|
42
40
|
|
43
41
|
logger.info(f"Lineage extraction complete: {output}")
|
@@ -51,7 +49,7 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
|
|
51
49
|
"""
|
52
50
|
logger.info(f"Cleaning sequences from {input_csv.name}")
|
53
51
|
|
54
|
-
from src.debase.cleanup_sequence import main as cleanup_sequences
|
52
|
+
from .cleanup_sequence import main as cleanup_sequences
|
55
53
|
cleanup_sequences([str(input_csv), str(output_csv)])
|
56
54
|
|
57
55
|
logger.info(f"Sequence cleanup complete: {output_csv}")
|
@@ -65,7 +63,7 @@ def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, outpu
|
|
65
63
|
"""
|
66
64
|
logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
|
67
65
|
|
68
|
-
from src.debase.reaction_info_extractor import ReactionExtractor, Config
|
66
|
+
from .reaction_info_extractor import ReactionExtractor, Config
|
69
67
|
import pandas as pd
|
70
68
|
|
71
69
|
# Load enzyme data
|
@@ -89,7 +87,7 @@ def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path
|
|
89
87
|
"""
|
90
88
|
logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
|
91
89
|
|
92
|
-
from src.debase.substrate_scope_extractor import run_pipeline
|
90
|
+
from .substrate_scope_extractor import run_pipeline
|
93
91
|
|
94
92
|
# Run substrate scope extraction
|
95
93
|
run_pipeline(
|
@@ -111,7 +109,7 @@ def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_cs
|
|
111
109
|
"""
|
112
110
|
logger.info(f"Formatting and merging data into final output")
|
113
111
|
|
114
|
-
from src.debase.lineage_format import run_pipeline
|
112
|
+
from .lineage_format import run_pipeline
|
115
113
|
import pandas as pd
|
116
114
|
|
117
115
|
# First, we need to merge the protein sequences into the reaction data
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: debase
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Enzyme lineage analysis and sequence extraction package
|
5
5
|
Home-page: https://github.com/YuemingLong/DEBase
|
6
6
|
Author: DEBase Team
|
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
64
64
|
```bash
|
65
65
|
pip install debase
|
66
66
|
```
|
67
|
-
|
68
|
-
For full functionality with chemical SMILES support:
|
69
|
-
|
70
|
-
```bash
|
71
|
-
pip install debase[rdkit]
|
72
|
-
```
|
73
|
-
|
74
67
|
## Requirements
|
75
68
|
|
76
69
|
- Python 3.8 or higher
|
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
|
|
139
132
|
debase --manuscript paper.pdf --si si.pdf # Default method
|
140
133
|
```
|
141
134
|
|
142
|
-
## Performance Comparison
|
143
|
-
|
144
|
-
| Method | Total Time | API Calls | Accuracy | Best For |
|
145
|
-
|--------|------------|-----------|----------|----------|
|
146
|
-
| Sequential | ~45 min | 44 calls | Highest | Small datasets |
|
147
|
-
| **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
|
148
|
-
| Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
|
149
135
|
|
150
136
|
## Advanced Usage
|
151
137
|
|
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
|
|
169
155
|
--manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
|
170
156
|
--max-workers 5 --output substrate_scope.csv
|
171
157
|
```
|
172
|
-
|
173
|
-
## Python API
|
174
|
-
|
175
|
-
```python
|
176
|
-
from debase.wrapper import run_pipeline
|
177
|
-
|
178
|
-
# Run full pipeline with parallel processing
|
179
|
-
run_pipeline(
|
180
|
-
manuscript_path="paper.pdf",
|
181
|
-
si_path="si.pdf",
|
182
|
-
output="output.csv",
|
183
|
-
use_parallel_individual=True,
|
184
|
-
max_workers=5
|
185
|
-
)
|
186
|
-
|
187
|
-
# For individual steps
|
188
|
-
from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
|
189
|
-
from debase.enzyme_lineage_extractor import setup_gemini_api
|
190
|
-
|
191
|
-
model = setup_gemini_api()
|
192
|
-
reaction_data = extract_reaction_info_parallel(
|
193
|
-
model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
|
194
|
-
)
|
195
|
-
```
|
196
|
-
|
197
158
|
## Pipeline Architecture
|
198
159
|
|
199
160
|
The DEBase pipeline consists of 5 main steps:
|
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
222
183
|
- **External database integration:** Automatic sequence fetching from PDB and UniProt
|
223
184
|
- **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
|
224
185
|
- **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
|
225
|
-
- **Progress tracking:** Real-time status updates
|
226
|
-
- **Flexible output:** CSV format with comprehensive chemical and performance data
|
227
|
-
- **Caching:** PDF encoding cache for improved performance
|
228
186
|
- **Vision capabilities:** Extracts data from both text and images in PDFs
|
229
187
|
|
230
188
|
## Complete Command Reference
|
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
|
|
234
192
|
--manuscript PATH # Required: Path to manuscript PDF
|
235
193
|
--si PATH # Optional: Path to supplementary information PDF
|
236
194
|
--output PATH # Output file path (default: manuscript_name_debase.csv)
|
237
|
-
--queries N # Number of consensus queries (default: 2)
|
238
195
|
```
|
239
196
|
|
240
197
|
### Performance Options
|
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
|
|
279
236
|
3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
|
280
237
|
4. **Skip validation** (`--skip-validation`) for faster processing in production
|
281
238
|
5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
|
282
|
-
6.
|
283
|
-
7. **Verify enzyme entries** - The system automatically filters out buffers and controls
|
284
|
-
|
285
|
-
## Troubleshooting
|
286
|
-
|
287
|
-
### No sequences found
|
288
|
-
- The extractor will automatically search PDB and UniProt databases
|
289
|
-
- Check the logs for which database IDs were found and attempted
|
290
|
-
- Sequences with PDB structures will be fetched with high confidence
|
291
|
-
|
292
|
-
### Incorrect enzyme extraction
|
293
|
-
- Non-enzyme entries (buffers, controls, media) are automatically filtered
|
294
|
-
- Check the log for entries marked as "Filtering out non-enzyme entry"
|
239
|
+
6.
|
295
240
|
|
296
|
-
### PDB matching issues
|
297
|
-
- The system uses AI to match PDB IDs to specific enzyme variants
|
298
|
-
- Increased context extraction ensures better matching accuracy
|
299
|
-
- Check logs for "Gemini PDB matching" entries to see the matching process
|
@@ -1,17 +1,17 @@
|
|
1
1
|
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=rPOdIIhUKYST-L457GFA8SWkOdMGZQAiiaWLSYHnVwc,49
|
5
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
7
|
debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
|
8
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
|
-
debase/reaction_info_extractor.py,sha256=
|
9
|
+
debase/reaction_info_extractor.py,sha256=vgXE4eFSmRUU_RPsW7E0vbP5mU0tjrhwk7UVqz_98yM,111469
|
10
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
|
-
debase/wrapper.py,sha256=
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
17
|
-
debase-0.1.
|
11
|
+
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
12
|
+
debase-0.1.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.2.dist-info/METADATA,sha256=t5JrPGNEtLsF3qXrSpHvn02_rlNGlkYv-NDubaXZa2w,9382
|
14
|
+
debase-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|