debase 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {debase-0.6.0/src/debase.egg-info → debase-0.6.1}/PKG-INFO +1 -1
  2. debase-0.6.1/manuscript/DEBase_LLM_Validater.ipynb +197 -0
  3. {debase-0.6.0 → debase-0.6.1}/src/debase/_version.py +1 -1
  4. debase-0.6.1/src/debase/campaign_utils.py +146 -0
  5. debase-0.6.1/src/debase/caption_pattern.py +39 -0
  6. {debase-0.6.0 → debase-0.6.1}/src/debase/enzyme_lineage_extractor.py +58 -20
  7. {debase-0.6.0 → debase-0.6.1}/src/debase/reaction_info_extractor.py +407 -75
  8. {debase-0.6.0 → debase-0.6.1}/src/debase/substrate_scope_extractor.py +124 -49
  9. {debase-0.6.0 → debase-0.6.1}/src/debase/wrapper.py +3 -3
  10. {debase-0.6.0 → debase-0.6.1/src/debase.egg-info}/PKG-INFO +1 -1
  11. {debase-0.6.0 → debase-0.6.1}/src/debase.egg-info/SOURCES.txt +3 -0
  12. {debase-0.6.0 → debase-0.6.1}/.gitignore +0 -0
  13. {debase-0.6.0 → debase-0.6.1}/LICENSE +0 -0
  14. {debase-0.6.0 → debase-0.6.1}/MANIFEST.in +0 -0
  15. {debase-0.6.0 → debase-0.6.1}/README.md +0 -0
  16. {debase-0.6.0 → debase-0.6.1}/environment.yml +0 -0
  17. {debase-0.6.0 → debase-0.6.1}/pyproject.toml +0 -0
  18. {debase-0.6.0 → debase-0.6.1}/setup.cfg +0 -0
  19. {debase-0.6.0 → debase-0.6.1}/setup.py +0 -0
  20. {debase-0.6.0 → debase-0.6.1}/src/__init__.py +0 -0
  21. {debase-0.6.0 → debase-0.6.1}/src/debase/__init__.py +0 -0
  22. {debase-0.6.0 → debase-0.6.1}/src/debase/__main__.py +0 -0
  23. {debase-0.6.0 → debase-0.6.1}/src/debase/build_db.py +0 -0
  24. {debase-0.6.0 → debase-0.6.1}/src/debase/cleanup_sequence.py +0 -0
  25. {debase-0.6.0 → debase-0.6.1}/src/debase/lineage_format.py +0 -0
  26. {debase-0.6.0 → debase-0.6.1}/src/debase.egg-info/dependency_links.txt +0 -0
  27. {debase-0.6.0 → debase-0.6.1}/src/debase.egg-info/entry_points.txt +0 -0
  28. {debase-0.6.0 → debase-0.6.1}/src/debase.egg-info/requires.txt +0 -0
  29. {debase-0.6.0 → debase-0.6.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,197 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "1714f461-f0c0-4a33-8b81-54c40ce47b80",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Test all the downloaded papers against the LLM extractor\n",
9
+ "\n",
10
+ "Command: \n",
11
+ "```\n",
12
+ "debase --manuscript \"downloaded_papers/Stereospecific Enzymatic Conversion of Boronic Acids to Amines.pdf\" --si \"downloaded_papers/si-Stereospecific Enzymatic Conversion of Boronic Acids to Amines.pdf\" --output \"LLM/Stereospecific Enzymatic Conversion of Boronic Acids to Amines.csv\" --keep-intermediates\n",
13
+ "```"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "a77e19f4-ef52-469d-8a15-6816101fa58b",
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "2025-07-16 09:04:29 [WARNING] debase.lineage_format: Local PubChem DB not found at /Users/arianemora/miniconda3/envs/debase/lib/python3.11/data/iupac2smiles.db\n",
27
+ "usage: debase [-h] [--manuscript MANUSCRIPT] [--si SI] [--output OUTPUT]\n",
28
+ " [--keep-intermediates] [--debug-dir DEBUG_DIR]\n",
29
+ "debase: error: SI not found: downloaded_papers/si-Enzymatic Nitrogen Insertion into Unactivated C-H Bonds.pdf\n"
30
+ ]
31
+ },
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "TIME: 0.6438441276550293\n"
37
+ ]
38
+ },
39
+ {
40
+ "name": "stderr",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "2025-07-16 09:04:30 [WARNING] debase.lineage_format: Local PubChem DB not found at /Users/arianemora/miniconda3/envs/debase/lib/python3.11/data/iupac2smiles.db\n",
44
+ "2025-07-16 09:04:30,049 - INFO - ============================================================\n",
45
+ "2025-07-16 09:04:30,049 - INFO - Starting DEBase Enzyme Analysis Pipeline\n",
46
+ "2025-07-16 09:04:30,049 - INFO - Manuscript: downloaded_papers/Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
47
+ "2025-07-16 09:04:30,049 - INFO - SI: downloaded_papers/si-Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
48
+ "2025-07-16 09:04:30,049 - INFO - Output: LLM/manuscript_1/Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.csv\n",
49
+ "2025-07-16 09:04:30,049 - INFO - Log file: LLM/manuscript_1/debase_pipeline_20250716_090430.log\n",
50
+ "2025-07-16 09:04:30,049 - INFO - ============================================================\n",
51
+ "2025-07-16 09:04:30,049 - INFO - \n",
52
+ "[Step 1/5] Extracting enzyme lineage...\n",
53
+ "2025-07-16 09:04:30,049 - INFO - Extracting enzyme lineage from Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
54
+ "2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Loaded 11495 chars of captions for identification and 80929 chars of full text for extraction\n",
55
+ "2025-07-16 09:04:30,454 - INFO - Loaded 11495 chars of captions for identification and 80929 chars of full text for extraction\n",
56
+ "2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: === GEMINI API CALL: CAMPAIGNS ===\n",
57
+ "2025-07-16 09:04:30,454 - INFO - === GEMINI API CALL: CAMPAIGNS ===\n",
58
+ "2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Prompt length: 82371 characters\n",
59
+ "2025-07-16 09:04:30,454 - INFO - Prompt length: 82371 characters\n",
60
+ "2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: First 500 chars of prompt:\n",
61
+ "You are an expert reader of protein engineering manuscripts.\n",
62
+ "Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.\n",
63
+ "\n",
64
+ "Each campaign represents a separate evolutionary lineage targeting different:\n",
65
+ "- Model reactions (e.g., different chemical transformations)\n",
66
+ "- Substrate scopes\n",
67
+ "- Activities (e.g., different enzymatic reactions)\n",
68
+ "\n",
69
+ "Look for:\n",
70
+ "1. Different model substrates/products mentioned (e.g., different substrate/product pairs)\n",
71
+ "2. Distinct enzyme lineage names (\n",
72
+ "...(truncated)\n",
73
+ "2025-07-16 09:04:30,454 - INFO - First 500 chars of prompt:\n",
74
+ "You are an expert reader of protein engineering manuscripts.\n",
75
+ "Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.\n",
76
+ "\n",
77
+ "Each campaign represents a separate evolutionary lineage targeting different:\n",
78
+ "- Model reactions (e.g., different chemical transformations)\n",
79
+ "- Substrate scopes\n",
80
+ "- Activities (e.g., different enzymatic reactions)\n",
81
+ "\n",
82
+ "Look for:\n",
83
+ "1. Different model substrates/products mentioned (e.g., different substrate/product pairs)\n",
84
+ "2. Distinct enzyme lineage names (\n",
85
+ "...(truncated)\n",
86
+ "2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Calling Gemini API (attempt 1/4)...\n",
87
+ "2025-07-16 09:04:30,454 - INFO - Calling Gemini API (attempt 1/4)...\n",
88
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Gemini response length: 2188 characters\n",
89
+ "2025-07-16 09:04:49,096 - INFO - Gemini response length: 2188 characters\n",
90
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: First 500 chars of response:\n",
91
+ "```json\n",
92
+ "[\n",
93
+ " {\n",
94
+ " \"campaign_id\": \"ps_efe_olefin_aziridination_evolution\",\n",
95
+ " \"campaign_name\": \"PsEFE Olefin Aziridination Evolution\",\n",
96
+ " \"description\": \"Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\",\n",
97
+ " \"model_substrate\": \"styrene and p-toluenesulfonyl azide\",\n",
98
+ " \"model_product\": \"2-phenyl-1-(p-toluenesulfonyl)aziridine\",\n",
99
+ " \"\n",
100
+ "...(truncated)\n",
101
+ "2025-07-16 09:04:49,096 - INFO - First 500 chars of response:\n",
102
+ "```json\n",
103
+ "[\n",
104
+ " {\n",
105
+ " \"campaign_id\": \"ps_efe_olefin_aziridination_evolution\",\n",
106
+ " \"campaign_name\": \"PsEFE Olefin Aziridination Evolution\",\n",
107
+ " \"description\": \"Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\",\n",
108
+ " \"model_substrate\": \"styrene and p-toluenesulfonyl azide\",\n",
109
+ " \"model_product\": \"2-phenyl-1-(p-toluenesulfonyl)aziridine\",\n",
110
+ " \"\n",
111
+ "...(truncated)\n",
112
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Successfully parsed JSON response\n",
113
+ "2025-07-16 09:04:49,096 - INFO - Successfully parsed JSON response\n",
114
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified campaign: PsEFE Olefin Aziridination Evolution (ps_efe_olefin_aziridination_evolution)\n",
115
+ "2025-07-16 09:04:49,096 - INFO - Identified campaign: PsEFE Olefin Aziridination Evolution (ps_efe_olefin_aziridination_evolution)\n",
116
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified campaign: PsEFE Nitrene C-H Insertion Screening and Optimization (ps_efe_nitrene_ch_insertion_screening)\n",
117
+ "2025-07-16 09:04:49,096 - INFO - Identified campaign: PsEFE Nitrene C-H Insertion Screening and Optimization (ps_efe_nitrene_ch_insertion_screening)\n",
118
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified 2 distinct campaigns\n",
119
+ "2025-07-16 09:04:49,096 - INFO - Identified 2 distinct campaigns\n",
120
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: - PsEFE Olefin Aziridination Evolution: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
121
+ "2025-07-16 09:04:49,096 - INFO - - PsEFE Olefin Aziridination Evolution: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
122
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: - PsEFE Nitrene C-H Insertion Screening and Optimization: Screening and characterization of PsEFE variants (derived from the aziridination evolution) for intramolecular nitrene C-H bond insertion, identifying variants with enhanced activity, chemoselectivity, and enantioselectivity, particularly with N-oxalylglycine as a ligand.\n",
123
+ "2025-07-16 09:04:49,096 - INFO - - PsEFE Nitrene C-H Insertion Screening and Optimization: Screening and characterization of PsEFE variants (derived from the aziridination evolution) for intramolecular nitrene C-H bond insertion, identifying variants with enhanced activity, chemoselectivity, and enantioselectivity, particularly with N-oxalylglycine as a ligand.\n",
124
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Using campaign-aware location identification\n",
125
+ "2025-07-16 09:04:49,096 - INFO - Using campaign-aware location identification\n",
126
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: \n",
127
+ "Processing campaign: ps_efe_olefin_aziridination_evolution - PsEFE Olefin Aziridination Evolution\n",
128
+ "2025-07-16 09:04:49,096 - INFO - \n",
129
+ "Processing campaign: ps_efe_olefin_aziridination_evolution - PsEFE Olefin Aziridination Evolution\n",
130
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: === GEMINI API CALL: LOCATE ===\n",
131
+ "2025-07-16 09:04:49,143 - INFO - === GEMINI API CALL: LOCATE ===\n",
132
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Prompt length: 18517 characters\n",
133
+ "2025-07-16 09:04:49,143 - INFO - Prompt length: 18517 characters\n",
134
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: First 500 chars of prompt:\n",
135
+ "You are an expert reader of protein engineering manuscripts.\n",
136
+ "\n",
137
+ "You are looking for lineage data for a SPECIFIC campaign:\n",
138
+ "- Campaign: PsEFE Olefin Aziridination Evolution\n",
139
+ "- Description: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
140
+ "- Key identifiers: Initial screening was performed with α-ketoglutarate, but subsequent evolution and sc\n",
141
+ "...(truncated)\n",
142
+ "2025-07-16 09:04:49,143 - INFO - First 500 chars of prompt:\n",
143
+ "You are an expert reader of protein engineering manuscripts.\n",
144
+ "\n",
145
+ "You are looking for lineage data for a SPECIFIC campaign:\n",
146
+ "- Campaign: PsEFE Olefin Aziridination Evolution\n",
147
+ "- Description: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
148
+ "- Key identifiers: Initial screening was performed with α-ketoglutarate, but subsequent evolution and sc\n",
149
+ "...(truncated)\n",
150
+ "2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Calling Gemini API (attempt 1/4)...\n",
151
+ "2025-07-16 09:04:49,143 - INFO - Calling Gemini API (attempt 1/4)...\n"
152
+ ]
153
+ }
154
+ ],
155
+ "source": [
156
+ "import time\n",
157
+ "import os \n",
158
+ "\n",
159
+ "data_dir = 'downloaded_papers/'\n",
160
+ "files = [f for f in os.listdir(data_dir) if f[:3] != 'si-']\n",
161
+ "\n",
162
+ "with open('time_log.txt', 'w+') as fout:\n",
163
+ " for i, f in enumerate(files):\n",
164
+ " start = time.time() \n",
165
+ " output = f.replace('.pdf', '.csv')\n",
166
+ " os.system(f'mkdir LLM/manuscript_{i}/')\n",
167
+ " cmd = f'debase --manuscript \"downloaded_papers/{f}\" --si \"downloaded_papers/si-{f}\" --output \"LLM/manuscript_{i}/{output}\" --keep-intermediates'\n",
168
+ " os.system(cmd)\n",
169
+ " end = time.time()\n",
170
+ " \n",
171
+ " print('TIME:', end - start)\n",
172
+ " fout.write(f'{f}\\tmanuscript_{i}\\t{end-start}\\t{cmd}\\n')"
173
+ ]
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "Python 3 (ipykernel)",
179
+ "language": "python",
180
+ "name": "python3"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 3
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython3",
192
+ "version": "3.11.13"
193
+ }
194
+ },
195
+ "nbformat": 4,
196
+ "nbformat_minor": 5
197
+ }
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.6.0"
3
+ __version__ = "0.6.1"
@@ -0,0 +1,146 @@
1
+ """Utilities for handling campaign information across extractors.
2
+
3
+ This module provides functions to load and use campaign information
4
+ to improve extraction accuracy by providing context about model substrates,
5
+ products, and data locations.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import List, Dict, Optional, Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def load_campaigns_from_file(campaign_file: Path) -> List[Dict[str, Any]]:
    """Load campaign information from a JSON file.

    Loading is best-effort: every failure mode is logged and reported to the
    caller as an empty list rather than an exception.

    Args:
        campaign_file: Path to the campaigns JSON file. A plain string path
            is accepted as well.

    Returns:
        List of campaign dictionaries. Empty when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON array.
        Array entries that are not JSON objects are dropped with a warning.
    """
    campaign_file = Path(campaign_file)
    if not campaign_file.exists():
        logger.warning(f"Campaign file not found: {campaign_file}")
        return []

    try:
        # Explicit UTF-8: campaign text routinely contains non-ASCII
        # characters and the platform default encoding is not reliable.
        with open(campaign_file, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"Failed to load campaigns from {campaign_file}: {e}")
        return []

    if not isinstance(loaded, list):
        logger.error(
            f"Failed to load campaigns from {campaign_file}: "
            f"expected a JSON array, got {type(loaded).__name__}"
        )
        return []

    campaigns = [entry for entry in loaded if isinstance(entry, dict)]
    skipped = len(loaded) - len(campaigns)
    if skipped:
        logger.warning(
            f"Ignored {skipped} non-object entries in {campaign_file}"
        )

    logger.info(f"Loaded {len(campaigns)} campaigns from {campaign_file}")
    return campaigns
37
+
38
+
39
def find_campaign_by_id(campaigns: List[Dict[str, Any]], campaign_id: str) -> Optional[Dict[str, Any]]:
    """Find a specific campaign by ID.

    Args:
        campaigns: List of campaign dictionaries. ``None`` or an empty list
            yields ``None``.
        campaign_id: Campaign ID to search for.

    Returns:
        The first campaign dictionary whose ``'campaign_id'`` equals
        ``campaign_id``, or ``None`` when there is no such campaign.
    """
    # A None id must never match: dict.get() also returns None for campaigns
    # that simply lack the key, which would otherwise be a false hit.
    if campaign_id is None or not campaigns:
        return None

    return next(
        (
            campaign
            for campaign in campaigns
            # Skip malformed (non-dict) entries instead of raising.
            if isinstance(campaign, dict)
            and campaign.get('campaign_id') == campaign_id
        ),
        None,
    )
53
+
54
+
55
def _normalize_locations(raw: Any) -> List[str]:
    """Coerce a campaign's ``data_locations`` value into a new list of strings.

    Handles a missing/None value (empty list), a single bare string (wrapped
    in a list rather than being iterated character by character) and
    sequences containing None or non-string items.
    """
    if not raw:
        return []
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, (list, tuple)):
        return [str(item) for item in raw if item is not None]
    return [str(raw)]


def get_campaign_context(campaign: Dict[str, Any]) -> str:
    """Generate context string for prompts from campaign information.

    Args:
        campaign: Campaign dictionary. Only ``campaign_name`` and
            ``description`` lines are always emitted; ``model_substrate``,
            ``model_product``, ``data_locations``, ``lineage_hint`` and
            ``notes`` each add a line when present and truthy.

    Returns:
        Formatted, newline-separated context string for inclusion in prompts.
    """
    context_parts = [
        # Basic campaign info
        f"Campaign: {campaign.get('campaign_name', 'Unknown')}",
        f"Description: {campaign.get('description', '')}",
    ]

    # Model reaction info
    if campaign.get('model_substrate'):
        context_parts.append(
            f"Model Substrate: {campaign['model_substrate']} "
            f"(ID: {campaign.get('substrate_id', 'unknown')})"
        )
    if campaign.get('model_product'):
        context_parts.append(
            f"Model Product: {campaign['model_product']} "
            f"(ID: {campaign.get('product_id', 'unknown')})"
        )

    # Data locations (normalized so a bare string is not split into characters)
    locations = _normalize_locations(campaign.get('data_locations'))
    if locations:
        context_parts.append(f"Key Data Locations: {', '.join(locations)}")

    # Lineage hint if available
    if campaign.get('lineage_hint'):
        context_parts.append(f"Evolution Pathway: {campaign['lineage_hint']}")

    # Additional notes
    if campaign.get('notes'):
        context_parts.append(f"Notes: {campaign['notes']}")

    return '\n'.join(context_parts)


def get_location_hints_for_campaign(campaign: Dict[str, Any]) -> List[str]:
    """Extract specific location hints from campaign data.

    Args:
        campaign: Campaign dictionary.

    Returns:
        A new list of location strings (e.g., ["Figure 2a", "Table S4"]).
        Always a list, even when ``data_locations`` is absent, null, or a
        single string; mutating it does not affect the campaign.
    """
    return _normalize_locations(campaign.get('data_locations'))


def enhance_prompt_with_campaign(prompt: str, campaign: Optional[Dict[str, Any]],
                                 section_name: str = "CAMPAIGN CONTEXT") -> str:
    """Enhance a prompt with campaign context information.

    The campaign section is inserted before the first line after index 5
    that is blank or starts with ``Given``/``You``. When the prompt has no
    such line, the section is prepended instead.

    Args:
        prompt: Original prompt.
        campaign: Campaign dictionary (optional). A falsy value leaves the
            prompt unchanged.
        section_name: Section header for the campaign context.

    Returns:
        Enhanced prompt with campaign context.
    """
    if not campaign:
        return prompt

    context = get_campaign_context(campaign)
    locations = get_location_hints_for_campaign(campaign)
    divider = '-' * 50

    campaign_section = f"\n\n{section_name}:\n{divider}\n{context}"
    if locations:
        campaign_section += (
            f"\n\nIMPORTANT: Focus particularly on these locations: {', '.join(locations)}"
        )
    campaign_section += f"\n{divider}\n"

    # Insert the context early, but after the initial instruction block:
    # the first six lines are never considered as an insertion point.
    lines = prompt.split('\n')
    insert_idx = next(
        (
            i
            for i, line in enumerate(lines)
            if i > 5 and (not line.strip() or line.startswith(('Given', 'You')))
        ),
        None,
    )

    if insert_idx is None:
        # Fallback: just prepend
        return campaign_section + prompt

    lines.insert(insert_idx, campaign_section)
    return '\n'.join(lines)
@@ -0,0 +1,39 @@
1
+ """Universal caption pattern for all DEBase extractors.
2
+
3
+ This module provides a consistent caption pattern that handles various
4
+ formats found in scientific papers, including:
5
+ - Standard formats: Figure 1, Fig. 1, Table 1
6
+ - Supplementary formats: Supplementary Figure 1, Supp. Table 1
7
+ - Extended data: Extended Data Figure 1, ED Fig. 1
8
+ - Other types: Scheme 1, Chart 1
9
+ - Page headers: S14 Table 5
10
+ - Various punctuation: Figure 1. Figure 1: Figure 1 |
11
+ """
12
+
13
+ import re
14
+
15
+ # Universal caption pattern that handles all common formats
16
+ UNIVERSAL_CAPTION_PATTERN = re.compile(
17
+ r"""
18
+ ^ # Start of line
19
+ [^\n]{0,20}? # Up to 20 chars of any content (page headers, etc.)
20
+ ( # Start capture group
21
+ (?:Extended\s+Data\s+)? # Optional "Extended Data" prefix
22
+ (?:ED\s+)? # Optional "ED" prefix
23
+ (?:Supplementary|Supp\.?|Suppl\.?)?\s* # Optional supplementary prefixes
24
+ (?:Table|Fig(?:ure)?|Scheme|Chart) # Main caption types
25
+ ) # End capture group
26
+ (?: # Non-capturing group for what follows
27
+ \s* # Optional whitespace
28
+ (?:S?\d+[A-Za-z]?|[IVX]+) # Number (with optional S prefix or roman)
29
+ (?:[.:|]|\s+\|)? # Optional punctuation (. : or |)
30
+ | # OR
31
+ \. # Just a period (for "Fig." without number)
32
+ )
33
+ """,
34
+ re.I | re.X | re.M
35
+ )
36
+
37
+ def get_universal_caption_pattern():
38
+ """Get the universal caption pattern for use in extractors."""
39
+ return UNIVERSAL_CAPTION_PATTERN
@@ -28,6 +28,13 @@ import fitz
28
28
  import re
29
29
  import json
30
30
  import time
31
+
32
+ # Import universal caption pattern
33
+ try:
34
+ from .caption_pattern import get_universal_caption_pattern
35
+ except ImportError:
36
+ # Fallback if running as standalone script
37
+ from caption_pattern import get_universal_caption_pattern
31
38
  import logging
32
39
  from pathlib import Path
33
40
  from dataclasses import dataclass, field
@@ -113,17 +120,8 @@ _DOI_REGEX = re.compile(r"10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+", re.I)
113
120
  # PDB ID regex - matches 4-character PDB codes
114
121
  _PDB_REGEX = re.compile(r"\b[1-9][A-Z0-9]{3}\b")
115
122
 
116
- # Improved caption prefix regex - captures most journal variants
117
- _CAPTION_PREFIX_RE = re.compile(
118
- r"""
119
- ^\s*
120
- (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
121
- Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table)) # label part
122
- \s*(?:S?\d+[A-Za-z]?|[IVX]+) # figure number
123
- [.:]?\s* # trailing punctuation/space
124
- """,
125
- re.I | re.X,
126
- )
123
+ # Use universal caption pattern
124
+ _CAPTION_PREFIX_RE = get_universal_caption_pattern()
127
125
 
128
126
 
129
127
  def _open_doc(pdf_path: str | Path | bytes):
@@ -956,6 +954,9 @@ def identify_evolution_locations(
956
954
  campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
957
955
  if hasattr(camp, 'notes') and camp.notes:
958
956
  campaign_context += f"- Key identifiers: {camp.notes}\n"
957
+ if hasattr(camp, 'data_locations') and camp.data_locations:
958
+ campaign_context += f"- KNOWN DATA LOCATIONS: {', '.join(camp.data_locations)}\n"
959
+ campaign_context += " IMPORTANT: Prioritize these known locations highly!\n"
959
960
  campaign_specific = f" for the '{camp.campaign_name}' campaign"
960
961
  campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
961
962
  campaign_example = f', "campaign_id": "{camp.campaign_id}"'
@@ -964,7 +965,10 @@ def identify_evolution_locations(
964
965
  campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
965
966
  for camp in campaigns:
966
967
  campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
968
+ if hasattr(camp, 'data_locations') and camp.data_locations:
969
+ campaign_context += f" Known locations: {', '.join(camp.data_locations)}\n"
967
970
  campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
971
+ campaign_context += "IMPORTANT: Prioritize the known locations listed above!\n"
968
972
  campaign_specific = " for any of the identified campaigns"
969
973
  campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
970
974
  campaign_example = ', "campaign_id": "campaign_id_here"'
@@ -1041,6 +1045,7 @@ def extract_complete_lineage(
1041
1045
  campaign_id: Optional[str] = None,
1042
1046
  campaign_info: Optional[Campaign] = None,
1043
1047
  pdf_paths: Optional[List[Path]] = None,
1048
+ location_str: Optional[str] = None,
1044
1049
  ) -> List[Variant]:
1045
1050
  """Prompt Gemini for the full lineage and return a list[Variant]."""
1046
1051
  # Build campaign context
@@ -1060,6 +1065,21 @@ IMPORTANT:
1060
1065
  2. Include "campaign_id": "{campaign_info.campaign_id}" for each variant in your response.
1061
1066
  3. Use the lineage hint pattern above to identify which variants belong to this campaign.
1062
1067
  4. Include parent variants only if they are direct ancestors in this campaign's lineage.
1068
+ """
1069
+
1070
+ # Add location context if provided
1071
+ location_context = ""
1072
+ if location_str:
1073
+ location_context = f"""
1074
+
1075
+ LOCATION CONTEXT:
1076
+ You are extracting data SPECIFICALLY from: {location_str}
1077
+
1078
+ CRITICAL INSTRUCTIONS:
1079
+ - ONLY extract enzyme variants that appear in {location_str}
1080
+ - DO NOT include variants from other figures, tables, or sections
1081
+ - If {location_str} references variants from other locations, DO NOT include those unless they are explicitly shown in {location_str}
1082
+ - Focus strictly on the data presented within the boundaries of {location_str}
1063
1083
  """
1064
1084
 
1065
1085
  # Extract table of contents from PDFs if available
@@ -1096,8 +1116,11 @@ IMPORTANT:
1096
1116
  # Include TOC in the prompt text
1097
1117
  combined_text = toc_text + text if toc_text else text
1098
1118
 
1119
+ # Combine campaign and location context
1120
+ full_context = campaign_context + location_context
1121
+
1099
1122
  prompt = _LINEAGE_EXTRACT_PROMPT.format(
1100
- campaign_context=campaign_context,
1123
+ campaign_context=full_context,
1101
1124
  schema=_LINEAGE_SCHEMA_HINT,
1102
1125
  text=combined_text[:MAX_CHARS],
1103
1126
  )
@@ -1705,7 +1728,8 @@ def get_lineage(
1705
1728
  debug_dir=debug_dir,
1706
1729
  campaign_id=campaign.campaign_id,
1707
1730
  campaign_info=campaign,
1708
- pdf_paths=pdf_paths
1731
+ pdf_paths=pdf_paths,
1732
+ location_str=location_str
1709
1733
  )
1710
1734
  if variants:
1711
1735
  log.info(f"Extracted {len(variants)} variants from {location_type}")
@@ -3364,6 +3388,9 @@ Only match variants that represent the SAME enzyme, accounting for different nam
3364
3388
  Return ONLY a JSON object mapping lineage IDs to sequence IDs.
3365
3389
  Format: {{"lineage_id": "sequence_id", ...}}
3366
3390
  Only include matches you are confident represent the same variant.
3391
+
3392
+ DO NOT include any explanation, reasoning, or text other than the JSON object.
3393
+ Response must be valid JSON that starts with {{ and ends with }}
3367
3394
  """
3368
3395
 
3369
3396
  try:
@@ -3406,17 +3433,28 @@ Only include matches you are confident represent the same variant.
3406
3433
  log.error(f"Full cleaned text: {text}")
3407
3434
  # Try to extract JSON from within the response
3408
3435
  import re
3409
- json_match = re.search(r'\{.*\}', text, re.DOTALL)
3410
- if json_match:
3436
+ # First try to find JSON in code blocks
3437
+ code_block_match = re.search(r'```json\s*(\{[^`]*\})\s*```', text, re.DOTALL)
3438
+ if code_block_match:
3411
3439
  try:
3412
- matches = json.loads(json_match.group(0))
3413
- log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
3440
+ matches = json.loads(code_block_match.group(1))
3441
+ log.info(f"Successfully extracted JSON from code block: {len(matches)} matches")
3414
3442
  except json.JSONDecodeError:
3415
- log.error("Failed to extract JSON from response")
3443
+ log.error("Failed to parse JSON from code block")
3416
3444
  matches = {}
3417
3445
  else:
3418
- log.error("No JSON object found in response")
3419
- matches = {}
3446
+ # Try to find standalone JSON object (non-greedy, looking for balanced braces)
3447
+ json_match = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
3448
+ if json_match:
3449
+ try:
3450
+ matches = json.loads(json_match.group(1))
3451
+ log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
3452
+ except json.JSONDecodeError:
3453
+ log.error("Failed to extract JSON from response")
3454
+ matches = {}
3455
+ else:
3456
+ log.error("No JSON object found in response")
3457
+ matches = {}
3420
3458
 
3421
3459
  # Create a mapping of sequence IDs to their data for efficient lookup
3422
3460
  seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}