debase 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +43 -6
- debase/reaction_info_extractor.py +14 -1
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/METADATA +57 -1
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/RECORD +9 -9
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/WHEEL +0 -0
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/entry_points.txt +0 -0
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.2.dist-info → debase-0.1.4.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
@@ -800,15 +800,36 @@ def identify_evolution_locations(
|
|
800
800
|
_dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
|
801
801
|
|
802
802
|
response = model.generate_content(mapping_prompt)
|
803
|
-
|
803
|
+
response_text = _extract_text(response).strip()
|
804
|
+
|
805
|
+
# Extract just the campaign_id from the response
|
806
|
+
# Look for the campaign_id pattern in the response
|
807
|
+
campaign_id = None
|
808
|
+
for campaign in campaigns:
|
809
|
+
if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
|
810
|
+
campaign_id = campaign.campaign_id
|
811
|
+
break
|
812
|
+
|
813
|
+
# If not found, try to extract the last line or quoted string
|
814
|
+
if not campaign_id:
|
815
|
+
# Try to find quoted string
|
816
|
+
quoted_match = re.search(r'"([^"]+)"', response_text)
|
817
|
+
if quoted_match:
|
818
|
+
campaign_id = quoted_match.group(1)
|
819
|
+
else:
|
820
|
+
# Take the last non-empty line
|
821
|
+
lines = [line.strip() for line in response_text.split('\n') if line.strip()]
|
822
|
+
if lines:
|
823
|
+
campaign_id = lines[-1].strip('"')
|
804
824
|
|
805
825
|
# Save mapping response to debug if provided
|
806
826
|
if debug_dir:
|
807
827
|
response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
|
808
|
-
_dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\
|
828
|
+
_dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
|
809
829
|
|
810
830
|
# Add campaign_id to location
|
811
|
-
|
831
|
+
if campaign_id:
|
832
|
+
loc['campaign_id'] = campaign_id
|
812
833
|
log.info(f"Mapped {location_str} to campaign: {campaign_id}")
|
813
834
|
except Exception as exc:
|
814
835
|
log.warning(f"Failed to map location to campaign: {exc}")
|
@@ -1297,6 +1318,8 @@ _SEQUENCE_SCHEMA_HINT = """
|
|
1297
1318
|
_SEQ_LOC_PROMPT = """
|
1298
1319
|
Find where FULL-LENGTH protein or DNA sequences are located in this document.
|
1299
1320
|
|
1321
|
+
PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
|
1322
|
+
|
1300
1323
|
Look for table of contents entries or section listings that mention sequences.
|
1301
1324
|
Return a JSON array where each element has:
|
1302
1325
|
- "section": the section heading or description
|
@@ -1305,6 +1328,7 @@ Return a JSON array where each element has:
|
|
1305
1328
|
Focus on:
|
1306
1329
|
- Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
|
1307
1330
|
- Return the EXACT notation as shown.
|
1331
|
+
- Prioritize sections that mention "protein" or "amino acid" sequences
|
1308
1332
|
|
1309
1333
|
Return [] if no sequence sections are found.
|
1310
1334
|
Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
|
@@ -1465,10 +1489,16 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
|
|
1465
1489
|
# --- 7.3 Main extraction prompt ---------------------------------------------
|
1466
1490
|
_SEQ_EXTRACTION_PROMPT = """
|
1467
1491
|
Extract EVERY distinct enzyme-variant sequence you can find in the text.
|
1492
|
+
|
1493
|
+
IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
|
1494
|
+
- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
|
1495
|
+
- Only extract dna_seq if NO amino acid sequence is available for that variant
|
1496
|
+
- This reduces redundancy since protein sequences are usually more relevant
|
1497
|
+
|
1468
1498
|
For each variant return:
|
1469
1499
|
* variant_id - the label used in the paper (e.g. "R4-10")
|
1470
1500
|
* aa_seq - amino-acid sequence (uppercase), or null
|
1471
|
-
* dna_seq - DNA sequence (A/C/G/T), or null
|
1501
|
+
* dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
|
1472
1502
|
|
1473
1503
|
Respond ONLY with **minified JSON** that matches the schema below.
|
1474
1504
|
NO markdown, no code fences, no commentary.
|
@@ -2029,8 +2059,15 @@ def run_pipeline(
|
|
2029
2059
|
sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
|
2030
2060
|
|
2031
2061
|
# 4a. Try PDB extraction if no sequences found -----------------------------
|
2032
|
-
if
|
2033
|
-
|
2062
|
+
# Check if we need PDB sequences (no sequences or only partial sequences)
|
2063
|
+
MIN_PROTEIN_LENGTH = 50 # Most proteins are >50 AA
|
2064
|
+
needs_pdb = (not sequences or
|
2065
|
+
all(s.aa_seq is None or (s.aa_seq and len(s.aa_seq) < MIN_PROTEIN_LENGTH)
|
2066
|
+
for s in sequences))
|
2067
|
+
|
2068
|
+
if needs_pdb:
|
2069
|
+
log.info("No full-length sequences found in paper (only partial sequences < %d AA), attempting PDB extraction...",
|
2070
|
+
MIN_PROTEIN_LENGTH)
|
2034
2071
|
|
2035
2072
|
# Extract PDB IDs from all PDFs
|
2036
2073
|
pdb_ids = []
|
@@ -1055,7 +1055,20 @@ Different campaigns may use different model reactions.
|
|
1055
1055
|
"""Extract text around a given location identifier."""
|
1056
1056
|
location_lower = location.lower()
|
1057
1057
|
|
1058
|
-
#
|
1058
|
+
# Handle compound locations like "Figure 2 caption and Section I"
|
1059
|
+
# Extract the first figure/table/scheme reference
|
1060
|
+
figure_match = re.search(r"(figure|scheme|table)\s*\d+", location_lower)
|
1061
|
+
if figure_match:
|
1062
|
+
primary_location = figure_match.group(0)
|
1063
|
+
# Try to find this primary location first
|
1064
|
+
for page_text in self.all_pages:
|
1065
|
+
if primary_location in page_text.lower():
|
1066
|
+
idx = page_text.lower().index(primary_location)
|
1067
|
+
start = max(0, idx - 500)
|
1068
|
+
end = min(len(page_text), idx + 3000)
|
1069
|
+
return page_text[start:end]
|
1070
|
+
|
1071
|
+
# Search in all pages for exact match
|
1059
1072
|
for page_text in self.all_pages:
|
1060
1073
|
if location_lower in page_text.lower():
|
1061
1074
|
# Find the location and extract context around it
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: debase
|
3
|
-
Version: 0.1.2
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Enzyme lineage analysis and sequence extraction package
|
5
5
|
Home-page: https://github.com/YuemingLong/DEBase
|
6
6
|
Author: DEBase Team
|
@@ -61,14 +61,70 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
|
|
61
61
|
|
62
62
|
## Installation
|
63
63
|
|
64
|
+
### Quick Install (PyPI)
|
64
65
|
```bash
|
65
66
|
pip install debase
|
66
67
|
```
|
68
|
+
|
69
|
+
### Development Setup with Conda (Recommended)
|
70
|
+
|
71
|
+
1. **Clone the repository**
|
72
|
+
```bash
|
73
|
+
git clone https://github.com/YuemingLong/DEBase.git
|
74
|
+
cd DEBase
|
75
|
+
```
|
76
|
+
|
77
|
+
2. **Create conda environment from provided file**
|
78
|
+
```bash
|
79
|
+
conda env create -f environment.yml
|
80
|
+
conda activate debase
|
81
|
+
```
|
82
|
+
|
83
|
+
3. **Install DEBase in development mode**
|
84
|
+
```bash
|
85
|
+
pip install -e .
|
86
|
+
```
|
87
|
+
|
88
|
+
### Manual Setup
|
89
|
+
|
90
|
+
If you prefer to set up the environment manually:
|
91
|
+
|
92
|
+
```bash
|
93
|
+
# Create new conda environment
|
94
|
+
conda create -n debase python=3.9
|
95
|
+
conda activate debase
|
96
|
+
|
97
|
+
# Install conda packages
|
98
|
+
conda install -c conda-forge pandas numpy matplotlib seaborn jupyter jupyterlab openpyxl biopython requests tqdm
|
99
|
+
|
100
|
+
# Install RDKit (optional - used for SMILES canonicalization)
|
101
|
+
conda install -c conda-forge rdkit
|
102
|
+
|
103
|
+
# Install pip-only packages
|
104
|
+
pip install PyMuPDF google-generativeai debase
|
105
|
+
```
|
106
|
+
|
107
|
+
**Note about RDKit**: RDKit is optional and only used for canonicalizing SMILES strings in the output. If not installed, DEBase will still function normally but SMILES strings won't be standardized.
|
108
|
+
|
67
109
|
## Requirements
|
68
110
|
|
69
111
|
- Python 3.8 or higher
|
70
112
|
- A Gemini API key (set as environment variable `GEMINI_API_KEY`)
|
71
113
|
|
114
|
+
### Setting up Gemini API Key
|
115
|
+
|
116
|
+
```bash
|
117
|
+
# Option 1: Export in your shell
|
118
|
+
export GEMINI_API_KEY="your-api-key-here"
|
119
|
+
|
120
|
+
# Option 2: Add to ~/.bashrc or ~/.zshrc for persistence
|
121
|
+
echo 'export GEMINI_API_KEY="your-api-key-here"' >> ~/.bashrc
|
122
|
+
source ~/.bashrc
|
123
|
+
|
124
|
+
# Option 3: Create .env file in project directory
|
125
|
+
echo 'GEMINI_API_KEY=your-api-key-here' > .env
|
126
|
+
```
|
127
|
+
|
72
128
|
## Recent Updates
|
73
129
|
|
74
130
|
- **Campaign-Aware Extraction**: Automatically detects and processes multiple directed evolution campaigns in a single paper
|
@@ -1,17 +1,17 @@
|
|
1
1
|
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=mcDHWqAxAKwMNAAyHmpWVDTK-zafQ1kQjmiwnsZbUD4,49
|
5
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
|
-
debase/enzyme_lineage_extractor.py,sha256=
|
7
|
+
debase/enzyme_lineage_extractor.py,sha256=s1kPOomvJjfMSN5odxeyXNmxiaOzXyOZICr4YUWU6j8,89288
|
8
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
|
-
debase/reaction_info_extractor.py,sha256=
|
9
|
+
debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
|
10
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
11
|
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
17
|
-
debase-0.1.2.dist-info/RECORD,,
|
12
|
+
debase-0.1.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.4.dist-info/METADATA,sha256=fZwXCP1i1s0VNq7Ds5bd2ys3pONgaV1XCe_edUkQdRU,10789
|
14
|
+
debase-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|