PyPI - debase - Versions diffs - 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

debase/_version.py +1 -1
debase/caption_pattern.py +7 -2
debase/cleanup_sequence.py +34 -6
debase/enzyme_lineage_extractor.py +423 -86
debase/lineage_format.py +44 -1
debase/reaction_info_extractor.py +73 -61
debase/substrate_scope_extractor.py +84 -32
{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
debase-0.6.2.dist-info/RECORD +18 -0
debase-0.6.1.dist-info/RECORD +0 -18
{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0

debase/lineage_format.py CHANGED Viewed

@@ -30,6 +30,7 @@ from __future__ import annotations
 import argparse
 import csv
+import difflib
 import json
 import logging
 import os
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
                             gemini_matched_count += 1
                             log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
                         else:
-                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+                            # Try fuzzy matching when exact match fails
+                            best_match = None
+                            best_score = 0
+                            # Try all possible keys in seq_lookup
+                            for key in seq_lookup.keys():
+                                if campaign_id in key:  # Only consider keys from same campaign
+                                    # Extract enzyme_id part from composite key
+                                    try:
+                                        _, key_enzyme_id = key.split('_', 1)
+                                    except ValueError:
+                                        continue
+                                    # Calculate similarity score
+                                    score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
+                                    # Always track the highest score
+                                    if score > best_score:
+                                        best_score = score
+                                        best_match = key
+                            # Accept the best match only if its similarity exceeds 0.5; the score is logged either way
+                            if best_match and best_score > 0.5:  # Lower threshold but log the score
+                                idx = entry["idx"]
+                                df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
+                                df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
+                                if seq_lookup[best_match]["nt_sequence"]:
+                                    df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
+                                    df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
+                                # Also copy generation and parent_enzyme_id
+                                df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
+                                df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
+                                # Store the match for later mutation copying
+                                _, matched_enzyme = best_match.split('_', 1)
+                                df.at[idx, "_matched_enzyme_id"] = matched_enzyme
+                                df.at[idx, "_matched_campaign_id"] = campaign_id
+                                gemini_matched_count += 1
+                                log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
+                            else:
+                                log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
             except Exception as e:
                 log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")

debase/reaction_info_extractor.py CHANGED Viewed

@@ -442,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
 Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
+activity, etc.) for enzyme variants.
+CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
+- Look for locations showing data for ALL enzyme variants in the evolution lineage
+- Prioritize sources that show the complete evolutionary progression (parent → child variants)
+- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
+- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
@@ -450,8 +456,13 @@ performance data locations. Pay careful attention to:
 - Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions
+IMPORTANT FIGURE REFERENCE RULES:
+- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
+- The extraction system will handle retrieving the entire figure including all sub-panels
+- For tables, return the complete reference as it appears
 Respond with a JSON array where each element contains:
-- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
+- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
 - "type": one of "table", "figure"
 - "confidence": your confidence score (0-100)
 - "caption": the exact caption text for this location
@@ -459,7 +470,12 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
+PRIORITIZATION RULES:
+- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
+- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
+- LOWEST PRIORITY: Sources showing data for individual variants only
+Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
 IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
@@ -503,6 +519,13 @@ IMPORTANT:
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
 - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
+CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
+- Yield (%) measures how much product was formed (0-100%)
+- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
+- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
+- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
+- Be extremely careful when extracting from tables/figures with multiple columns or data series
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
@@ -530,6 +553,17 @@ STEP 4: Extract values for each matched variant
 - CRITICAL: Read actual scale values from the axis labels and tick marks
 - Verify: taller bars should have higher values, higher dots should have higher values
+CRITICAL DATA ACCURACY REQUIREMENTS:
+- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
+- Yield is typically shown as percentage (0-100%)
+- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
+- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
+- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
+- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
+- If looking at grouped bars, ensure you're reading the correct bar for each metric
+- Double-check that variant A's yield is not confused with variant B's yield
+- If values are unclear or ambiguous, return null rather than guessing
 Target enzymes to find and extract:
 {enzyme_names}
@@ -887,15 +921,22 @@ class ReactionExtractor:
             campaign_context = f"""
             IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-            Campaign Details:
+            CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
+            - Campaign ID: {self.campaign_info.get('campaign_id', '')}
             - Name: {self.campaign_info.get('campaign_name', '')}
             - Description: {self.campaign_info.get('description', '')}
             - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
             - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
+            - Notes: {self.campaign_info.get('notes', '')}
             KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
             These locations are known to contain relevant data - prioritize them highly.
+            CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
+            - Look for data showing the entire evolutionary progression of enzyme variants
+            - Prioritize locations that show performance data for ALL variants in the lineage
+            - The campaign description and notes above provide context about the evolution strategy used
             {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
             CRITICAL: Only return locations that contain data for this specific campaign.
@@ -2558,41 +2599,17 @@ Do NOT include compound information from other campaigns.
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
-        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
+        # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
                        len(missing_compounds), sorted(missing_compounds))
-            # Get all available figures for compound structure analysis
-            figure_images = {}
-            # Extract main manuscript figures
-            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
-            for ref in figure_refs:
-                img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                if img_b64:
-                    figure_images[ref] = img_b64
-                    LOGGER.info("Retrieved %s for compound mapping", ref)
-            # Get SI figures
-            si_figure_refs = []
-            for page in self.si_pages[:10]:  # Check first 10 SI pages
-                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
-            # Extract SI figures
-            for ref in set(si_figure_refs):
-                if ref not in figure_images:
-                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                    if img_b64:
-                        figure_images[ref] = img_b64
-                        LOGGER.info("Extracted %s for compound mapping", ref)
             # Full text search including ALL pages (manuscript + SI)
             full_text = "\n\n".join(self.all_pages)  # Send everything
-            final_mappings = self._extract_compound_mappings_with_figures(
-                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            # Use text-only extraction for Tier 2 (no images)
+            final_mappings = self._extract_compound_mappings_from_text(
+                full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
             )
             # Merge final mappings with better compound ID matching
@@ -2933,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
                     LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                                list(compound_mappings.keys()))
-                    # First, populate IUPAC lists directly from compound mappings based on compound_type
-                    substrate_iupacs_from_mappings = []
-                    product_iupacs_from_mappings = []
-                    for mapping in compound_mappings.values():
-                        if mapping.iupac_name and mapping.compound_type:
-                            if mapping.compound_type.lower() == "substrate":
-                                substrate_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
-                            elif mapping.compound_type.lower() == "product":
-                                product_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
-                    # Initialize or update the IUPAC lists with mapped compounds
-                    if substrate_iupacs_from_mappings:
-                        existing_substrates = data.get("substrate_iupac_list", []) or []
-                        if isinstance(existing_substrates, list):
-                            data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
-                        else:
-                            data["substrate_iupac_list"] = substrate_iupacs_from_mappings
-                    if product_iupacs_from_mappings:
-                        existing_products = data.get("product_iupac_list", []) or []
-                        if isinstance(existing_products, list):
-                            data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
-                        else:
-                            data["product_iupac_list"] = product_iupacs_from_mappings
                     # Try to map substrate/product lists through compound IDs
                     substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
@@ -3571,6 +3560,11 @@ def main() -> None:
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
+        # Rename enzyme_id to enzyme if needed
+        if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
+            enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
+            LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
         # Detect campaign information from the enzyme CSV
         if 'campaign_id' in enzyme_df.columns:
             all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3601,6 +3595,11 @@ def main() -> None:
                                             campaign_info=campaign_info)
                 df_metrics = extractor.run(enzyme_df)
+                # For single campaign, also merge with lineage data
+                if not df_metrics.empty:
+                    df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
+                    LOGGER.info("Merged metrics with lineage data for single campaign")
             elif len(all_campaigns) > 1:
                 LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
                 all_results = []
@@ -3651,6 +3650,10 @@ def main() -> None:
                         # Merge campaign metrics with lineage data
                         campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+                        # Rename aa_seq to protein_sequence for consistency
+                        if 'aa_seq' in campaign_final.columns:
+                            campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
                         # Save campaign-specific file immediately
                         output_dir = args.output.parent
                         base_name = args.output.stem
@@ -3667,6 +3670,10 @@ def main() -> None:
                         # Still save an empty campaign file with lineage data
                         campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
                         if not campaign_lineage.empty:
+                            # Rename aa_seq to protein_sequence for consistency
+                            if 'aa_seq' in campaign_lineage.columns:
+                                campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
                             output_dir = args.output.parent
                             base_name = args.output.stem
                             campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3697,6 +3704,11 @@ def main() -> None:
     df_final = df_metrics
     LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
+    # Rename aa_seq to protein_sequence for consistency
+    if df_final is not None and 'aa_seq' in df_final.columns:
+        df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
+        LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -296,12 +296,14 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
     return "\n".join(chunks)
-def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
+def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: Optional[str] = None, document_hint: Optional[str] = None) -> Optional[str]:
     """Extract figure as a page region when embedded images aren't available.
     Args:
         pdf_paths: List of PDF paths to search
         figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
+        caption_hint: Optional caption text to help identify the exact figure
+        document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
     Returns:
         Base64-encoded PNG string or None if not found
@@ -318,8 +320,20 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
         log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
     else:
         base_figure_ref = figure_ref
-    for pdf_path in pdf_paths:
+    # Determine search order based on document hint
+    search_paths = list(pdf_paths)  # Create a copy
+    if document_hint and len(pdf_paths) > 1:
+        if document_hint.lower() == "manuscript":
+            # Prioritize manuscript (first PDF)
+            search_paths = [pdf_paths[0]] + pdf_paths[1:]
+            log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
+        elif document_hint.lower() == "supplementary":
+            # Prioritize SI (second PDF if available)
+            search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
+            log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
+    for pdf_path in search_paths:
         doc = _open_doc(pdf_path)
         try:
             for page_num in range(doc.page_count):
@@ -333,26 +347,38 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
                 figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
-                # Look for actual figure captions using regex patterns
-                caption_patterns = [
-                    rf"^Figure\s+{re.escape(figure_num)}\.",  # "Figure 3." at start of line
-                    rf"^Figure\s+{re.escape(figure_num)}:",   # "Figure 3:" at start of line
-                    rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]",  # "Figure 3 Substrate scope"
-                    rf"Figure\s+{re.escape(figure_num)}\s*\.",  # "Figure 3." anywhere
-                    rf"Figure\s+{re.escape(figure_num)}\s*:",  # "Figure 3:" anywhere
-                ]
-                for pattern in caption_patterns:
-                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
-                    if matches:
-                        # Found actual figure caption, get its position
-                        caption_text = matches.group(0)
-                        caption_instances = page.search_for(caption_text, quads=False)
+                # First try to find using caption hint if provided
+                if caption_hint and len(caption_hint) > 10:
+                    # Try to find the exact caption text
+                    caption_snippet = caption_hint[:100]  # Use first 100 chars
+                    if caption_snippet in page_text:
+                        caption_instances = page.search_for(caption_snippet, quads=False)
                         if caption_instances:
                             caption_rect = caption_instances[0]
                             found = True
-                            log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
-                            break
+                            log.info("Found figure using caption hint on page %d", page_num + 1)
+                # If not found with hint, look for actual figure captions using regex patterns
+                if not found:
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(figure_num)}\.",  # "Figure 3." at start of line
+                        rf"^Figure\s+{re.escape(figure_num)}:",   # "Figure 3:" at start of line
+                        rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]",  # "Figure 3 Substrate scope"
+                        rf"Figure\s+{re.escape(figure_num)}\s*\.",  # "Figure 3." anywhere
+                        rf"Figure\s+{re.escape(figure_num)}\s*:",  # "Figure 3:" anywhere
+                    ]
+                    for pattern in caption_patterns:
+                        matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                        if matches:
+                            # Found actual figure caption, get its position
+                            caption_text = matches.group(0)
+                            caption_instances = page.search_for(caption_text, quads=False)
+                            if caption_instances:
+                                caption_rect = caption_instances[0]
+                                found = True
+                                log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
+                                break
                 if not found:
                     continue
@@ -1135,17 +1161,24 @@ Your task is to:
 4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
 5. Determine which enzyme variants from this campaign were tested in substrate scope studies
+IMPORTANT FIGURE REFERENCE RULES:
+- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
+- Include the figure caption if available to help with identification
+- The extraction system will handle retrieving the entire figure including all sub-panels
 Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
 [
   {{
-    "location": "Description of where the data is found",
+    "location": "Main figure/table reference (e.g., 'Figure 2', 'Table S1', NOT 'Figure 2a')",
     "type": "table|figure|text",
     "confidence": 0.0-1.0,
     "enzyme_variants": ["list of enzyme IDs found"],
     "substrates_tested": ["list of substrates if identifiable"],
     "campaign_match": true/false,
     "is_substrate_scope": true/false,
-    "model_reaction_excluded": "reason why this is not a model reaction"
+    "model_reaction_excluded": "reason why this is not a model reaction",
+    "caption": "Include the figure/table caption if available",
+    "document": "manuscript|supplementary - specify whether this location is in the main manuscript or supplementary information"
   }}
 ]
@@ -1865,22 +1898,28 @@ def extract_substrate_scope_entries_for_campaign(
     all_refs = []
     if locations:
-        # Include ALL locations, not just primary
-        location_strs = []
-        for loc in locations[:3]:  # Up to 3 locations
-            loc_str = loc.get('location', '')
-            location_strs.append(loc_str)
-            all_refs.append(loc_str)
+        # Sort locations by confidence and use only the PRIMARY (most confident) location
+        sorted_locations = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+        primary_location = sorted_locations[0] if sorted_locations else None
-        extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"
+        if primary_location:
+            primary_ref = primary_location.get('location', '')
+            all_refs = [primary_ref]  # Only extract from primary location
+            extraction_hints = f"\nPRIMARY substrate scope location for campaign {campaign_id}: {primary_ref}"
+            extraction_hints += f"\nLocation confidence: {primary_location.get('confidence', 0)}%"
+            extraction_hints += f"\nLocation type: {primary_location.get('type', 'unknown')}"
         # Focus on campaign-specific enzyme variants
         extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
-    # Extract text from ALL identified locations (like the original function did)
+    # Extract text from ONLY the primary location
     extraction_texts = []
     figure_images = {}
+    # Create a mapping of location strings to their full location data
+    location_map = {loc.get('location', ''): loc for loc in locations}
     for ref in all_refs:
         if ref and pdf_paths:
             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
@@ -1889,7 +1928,10 @@ def extract_substrate_scope_entries_for_campaign(
                 # Extract figure images for this reference (crop page around figure)
                 try:
-                    fig_base64 = extract_figure_image(pdf_paths, ref)
+                    # Get caption and document hints if available
+                    caption_hint = location_map.get(ref, {}).get('caption', '')
+                    document_hint = location_map.get(ref, {}).get('document', '')
+                    fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
                     if fig_base64:
                         figure_images[ref] = fig_base64
                         log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
@@ -1942,6 +1984,14 @@ IMPORTANT INSTRUCTIONS:
 4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
 5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
+CRITICAL DATA ACCURACY REQUIREMENTS:
+- BE EXTREMELY CAREFUL about which substrate ID maps to which yield, TTN, and selectivity values
+- Each substrate entry should have its OWN yield, ee, and TTN values - do not mix up values between substrates
+- If looking at a table or figure, carefully match each substrate with its corresponding row/bar/data point
+- Double-check that substrate 1a's data is not confused with substrate 1b's data, etc.
+- If values are unclear or ambiguous for a specific substrate, return null rather than guessing
+- Pay special attention when extracting from figures - ensure you're reading the correct bar/point for each substrate
 {extraction_hints}
 Return your analysis as JSON in this format:
@@ -2287,13 +2337,15 @@ def get_substrate_scope(
         if should_extract:
             figure_ref = location_str
             confidence = loc.get('confidence', 0)
+            caption_hint = loc.get('caption', '')
             log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
             # Use appropriate extraction function based on type
             if 'scheme' in location_str.lower() or location_type == 'scheme':
                 figure_image = extract_scheme_image(pdf_paths, figure_ref)
             else:
-                figure_image = extract_figure_image(pdf_paths, figure_ref)
+                document_hint = loc.get('document', '')
+                figure_image = extract_figure_image(pdf_paths, figure_ref, caption_hint=caption_hint, document_hint=document_hint)
             if figure_image:
                 log.info("Successfully extracted %s image for %s (%d bytes)",

{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.6.1
+Version: 0.6.2
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.6.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
+debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
+debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
+debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
+debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
+debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
+debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
+debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
+debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
+debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.6.2.dist-info/RECORD,,

debase-0.6.1.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
-debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
-debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
-debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
-debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
-debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
-debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
-debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
-debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
-debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.6.1.dist-info/RECORD,,

{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.6.1.dist-info → debase-0.6.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl