PyPI - debase - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl - Mend

debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

debase/_version.py +1 -1
debase/enzyme_lineage_extractor.py +14 -8
debase/lineage_format.py +335 -56
debase/reaction_info_extractor.py +60 -32
debase/substrate_scope_extractor.py +373 -140
debase/wrapper.py +37 -11
{debase-0.4.0.dist-info → debase-0.4.2.dist-info}/METADATA +1 -1
debase-0.4.2.dist-info/RECORD +16 -0
debase-0.4.0.dist-info/RECORD +0 -16
{debase-0.4.0.dist-info → debase-0.4.2.dist-info}/WHEEL +0 -0
{debase-0.4.0.dist-info → debase-0.4.2.dist-info}/entry_points.txt +0 -0
{debase-0.4.0.dist-info → debase-0.4.2.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.0.dist-info → debase-0.4.2.dist-info}/top_level.txt +0 -0

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -83,6 +83,7 @@ class ScopeEntry:
     # Metadata
     data_location: Optional[str] = None
     data_source_type: Dict[str, str] = field(default_factory=dict)
+    campaign_id: Optional[str] = None
     # Lineage information (populated during merge)
     parent_id: Optional[str] = None
@@ -309,68 +310,27 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
-                # Extract multi-page region including the figure and content below
-                # The figure should be between the top of the viewable area and extend to subsequent pages
+                # Extract just the figure with its caption, avoiding excessive white space
                 page_rect = page.rect
-                # Define the region to extract
-                # Extract everything above the caption plus additional content from subsequent pages
-                top_margin = 0  # Start from the very top of the page
-                additional_pages = 2  # Number of additional pages to include
-                left_margin = 0  # Use full page width
-                right_margin = 0
-                # Calculate the figure region for the first page
-                fig_top = top_margin
-                fig_bottom = max(caption_rect.y0 + 150, page_rect.height)  # At least 150px below caption or full page
-                fig_left = left_margin
-                fig_right = page_rect.width - right_margin
-                # Create list to store all page images
-                page_images = []
+                # Extract the entire page containing the identified location
+                fig_top = 0  # Start from top of page
+                fig_bottom = page_rect.height  # Full page height
+                fig_left = 0  # Full width
+                fig_right = page_rect.width
-                # Extract first page (from top to bottom)
+                # Extract the entire page
                 clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
                 mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                 pix = page.get_pixmap(clip=clip_rect, matrix=mat)
-                page_images.append(pix)
-                # Extract additional pages if they exist
-                for additional_page_offset in range(1, additional_pages + 1):
-                    next_page_num = page_num + additional_page_offset
-                    if next_page_num < doc.page_count:
-                        next_page = doc.load_page(next_page_num)
-                        next_page_rect = next_page.rect
-                        # Extract full page for additional pages
-                        next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
-                        next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
-                        page_images.append(next_pix)
-                        log.info("Added page %d to multi-page extraction", next_page_num + 1)
-                # Combine all page images vertically
-                if len(page_images) == 1:
-                    # Single page extraction
-                    combined_pix = page_images[0]
-                else:
-                    # Multi-page extraction - combine vertically
-                    total_width = max(pix.width for pix in page_images)
-                    total_height = sum(pix.height for pix in page_images)
-                    # Create a new pixmap to hold the combined image
-                    combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
-                    combined_pix.clear_with(255)  # White background
-                    current_y = 0
-                    for pix in page_images:
-                        # Copy each page image to the combined image
-                        combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
-                        current_y += pix.height
+                log.info("Extracted entire page: %.0fx%.0f pixels from page %d",
+                         pix.width, pix.height, page_num + 1)
                 # Convert to PNG
-                img_bytes = combined_pix.tobytes("png")
-                log.info("Extracted multi-page figure region: %dx%d pixels from %d pages starting at page %d",
-                         combined_pix.width, combined_pix.height, len(page_images), page_num + 1)
+                img_bytes = pix.tobytes("png")
+                log.info("Converted to PNG: %dx%d pixels from page %d",
+                         pix.width, pix.height, page_num + 1)
                 return b64encode(img_bytes).decode()
@@ -1014,25 +974,73 @@ Return as JSON:
 # ---- 6.2  Helper functions -------------------------------------------------
-def identify_scope_locations(
+def identify_scope_locations_for_campaign(
     text: str,
     model,
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     max_results: int = 5,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Ask Gemini where substrate scope data is located."""
-    prompt = _SCOPE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
+    """Ask Gemini where substrate scope data is located for a specific campaign."""
+    # Simple model reaction context
+    model_reactions_context = """
+IMPORTANT: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Analyze this paper and identify all locations containing substrate scope data for the specific campaign: "{campaign_id}".
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+{model_reactions_context}
+Your task is to:
+1. Identify locations (tables, figures, text) containing substrate scope reaction data specifically for this campaign
+2. Focus only on substrate scope studies involving the enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
+5. Determine which enzyme variants from this campaign were tested in substrate scope studies
+Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
+[
+  {{
+    "location": "Description of where the data is found",
+    "type": "table|figure|text",
+    "confidence": 0.0-1.0,
+    "enzyme_variants": ["list of enzyme IDs found"],
+    "substrates_tested": ["list of substrates if identifiable"],
+    "campaign_match": true/false,
+    "is_substrate_scope": true/false,
+    "model_reaction_excluded": "reason why this is not a model reaction"
+  }}
+]
+Important: Only return locations that contain TRUE substrate scope data (not model reactions) for the specified campaign and enzymes. If no substrate scope data exists for this campaign, return an empty array.
+"""
+    prompt = campaign_prompt + "\n\nTEXT:\n" + text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
             model,
             prompt,
             debug_dir=debug_dir,
-            tag="scope_locate",
+            tag=f"scope_locate_{campaign_id}",
         )
     except Exception as exc:  # pragma: no cover
-        log.warning("identify_scope_locations(): %s", exc)
+        log.warning("identify_scope_locations_for_campaign(%s): %s", campaign_id, exc)
     return locs if isinstance(locs, list) else []
 def identify_iupac_sections(
@@ -1719,16 +1727,18 @@ def extract_compound_mappings(
     log.info("Total compound mappings extracted: %d", len(mappings))
     return mappings
-def extract_all_substrate_scope_data(
+def extract_substrate_scope_entries_for_campaign(
     text: str,
     model,
     locations: List[dict],
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     pdf_paths: List[Path] = None,
-    figure_images: Dict[str, str] = None,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Extract all substrate scope data at once from all primary sources."""
+    """Extract substrate scope data specifically for a campaign."""
     extraction_hints = ""
     all_refs = []
@@ -1740,124 +1750,189 @@ def extract_all_substrate_scope_data(
             location_strs.append(loc_str)
             all_refs.append(loc_str)
-        extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
-        # Collect all enzyme variants
-        all_variants = []
-        for loc in locations:
-            variants = loc.get('enzyme_variants_tested', [])
-            all_variants.extend(variants)
+        extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"
-        if all_variants:
-            unique_variants = list(set(all_variants))
-            extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
+        # Focus on campaign-specific enzyme variants
+        extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
-    # Extract text from ALL identified locations
+    # Extract text from ALL identified locations (like the original function did)
     extraction_texts = []
+    figure_images = {}
     for ref in all_refs:
         if ref and pdf_paths:
             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
             if ref_text:
-                # Add figure image notation if available
-                if figure_images and ref in figure_images:
-                    ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
                 extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
+                # Extract figure images for this reference (crop page around figure)
+                try:
+                    fig_base64 = extract_figure_image(pdf_paths, ref)
+                    if fig_base64:
+                        figure_images[ref] = fig_base64
+                        log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
+                        # Save the figure image to debug folder
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            # Clean ref for filename
+                            safe_ref = re.sub(r'[^\w\s-]', '', ref).strip().replace(' ', '_')
+                            image_file = debug_path / f"figure_{safe_ref}_{campaign_id}.png"
+                            # Decode and save the image
+                            import base64
+                            with open(image_file, 'wb') as f:
+                                f.write(base64.b64decode(fig_base64))
+                            log.info("Campaign %s - saved figure image to %s", campaign_id, image_file)
+                except Exception as e:
+                    log.warning("Campaign %s - failed to extract figure for %s: %s", campaign_id, ref, e)
     if not extraction_texts:
         extraction_texts = [text[:50_000]]
     extraction_text = "\n\n".join(extraction_texts)
-    prompt = _SUBSTRATE_SCOPE_PROMPT.format(extraction_hints=extraction_hints)
-    prompt += "\n\nTEXT:\n" + extraction_text
-    # Prepare multimodal content with images
-    content_parts = [prompt]
+    # Simple model reaction context
+    model_reactions_context = """
+CRITICAL: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""
-    # Add figure images to the prompt
-    if figure_images:
-        import PIL.Image
-        import io
-        import base64
-        for fig_ref, fig_base64 in figure_images.items():
-            try:
-                # Convert base64 to PIL Image
-                img_bytes = base64.b64decode(fig_base64)
-                image = PIL.Image.open(io.BytesIO(img_bytes))
-                content_parts.append(f"\n[Figure: {fig_ref}]")
-                content_parts.append(image)
-                log.info("Added figure %s to multimodal prompt", fig_ref)
-            except Exception as e:
-                log.warning("Failed to add figure %s: %s", fig_ref, e)
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Extract ALL substrate scope reaction data specifically for campaign: "{campaign_id}".
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+{model_reactions_context}
+IMPORTANT INSTRUCTIONS:
+1. Focus ONLY on substrate scope data for the specified campaign and enzymes
+2. Extract reactions involving enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes (listed above)
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+   - DO NOT include model reactions in substrate scope data
+4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
+5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
+{extraction_hints}
+Return your analysis as JSON in this format:
+{{
+  "substrate_scope_data": [
+    {{
+      "enzyme_id": "enzyme identifier",
+      "substrate_ids": ["substrate identifiers"],
+      "product_ids": ["product identifiers"],
+      "substrate_names": ["substrate names"],
+      "product_names": ["product names"],
+      "yield_percent": number or null,
+      "ee": number or null,
+      "ttn": number or null,
+      "temperature": "temperature" or null,
+      "ph": "pH" or null,
+      "buffer": "buffer" or null,
+      "substrate_concentration": "concentration" or null,
+      "data_location": "where this data was found",
+      "campaign_id": "{campaign_id}",
+      "is_substrate_scope": true,
+      "model_reaction_excluded": "reason why this is not a model reaction"
+    }}
+  ]
+}}
+Important: Only return TRUE substrate scope data (not model reactions) for the specified campaign. If no substrate scope data exists for this campaign, return {{"substrate_scope_data": []}}.
+"""
     try:
-        # Use multimodal content if we have images
-        if len(content_parts) > 1:
-            # Log multimodal API call
-            log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
-            log.info("Text prompt length: %d characters", len(prompt))
-            log.info("Number of images: %d", len(content_parts) - 1)
-            log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
+        # Use multimodal extraction if we have figure images
+        if figure_images:
+            log.info("Campaign %s - using multimodal extraction with %d figure images", campaign_id, len(figure_images))
+            # Prepare multimodal content
+            import PIL.Image
+            import io
+            import base64
+            content_parts = [campaign_prompt + "\n\nTEXT:\n" + extraction_text]
+            for fig_ref, fig_base64 in figure_images.items():
+                try:
+                    # Convert base64 to PIL Image
+                    img_bytes = base64.b64decode(fig_base64)
+                    image = PIL.Image.open(io.BytesIO(img_bytes))
+                    content_parts.append(f"\n[Figure: {fig_ref}]")
+                    content_parts.append(image)
+                    log.info("Campaign %s - added figure %s to multimodal prompt", campaign_id, fig_ref)
+                except Exception as e:
+                    log.warning("Campaign %s - failed to add figure %s: %s", campaign_id, fig_ref, e)
-            # Save prompt and image info to debug directory
+            # Save debug info
             if debug_dir:
                 debug_path = Path(debug_dir)
                 debug_path.mkdir(parents=True, exist_ok=True)
-                prompt_file = debug_path / f"substrate_scope_multimodal_prompt_{int(time.time())}.txt"
+                prompt_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_prompt.txt"
-                # Build prompt info including image references
-                prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
+                prompt_info = f"=== CAMPAIGN {campaign_id} MULTIMODAL PROMPT ===\n"
                 prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
-                prompt_info += f"Text length: {len(prompt)} characters\n"
-                prompt_info += f"Images included: {len(content_parts) - 1}\n"
+                prompt_info += f"Text length: {len(extraction_text)} characters\n"
+                prompt_info += f"Images included: {len(figure_images)}\n"
                 for fig_ref in figure_images.keys():
                     prompt_info += f"  - {fig_ref}\n"
                 prompt_info += "="*80 + "\n\n"
-                prompt_info += prompt
+                prompt_info += campaign_prompt + "\n\nTEXT:\n" + extraction_text
-                _dump(prompt_info, prompt_file)
-                log.info("Full prompt saved to: %s", prompt_file)
+                with open(prompt_file, 'w') as f:
+                    f.write(prompt_info)
+                log.info("Campaign %s - prompt saved to: %s", campaign_id, prompt_file)
-            log.info("Calling Gemini Multimodal API...")
+            # Call multimodal API
             response = model.generate_content(content_parts)
-            raw_text = _extract_text(response).strip()
-            # Log and save response
-            log.info("Gemini multimodal response length: %d characters", len(raw_text))
-            log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
+            raw_text = response.text.strip()
+            # Save response
             if debug_dir:
-                debug_path = Path(debug_dir)
-                response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
+                response_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_response.txt"
                 with open(response_file, 'w') as f:
-                    f.write(f"=== RESPONSE FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n")
+                    f.write(f"=== CAMPAIGN {campaign_id} MULTIMODAL RESPONSE ===\n")
                     f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                     f.write(f"Length: {len(raw_text)} characters\n")
                     f.write("="*80 + "\n\n")
                     f.write(raw_text)
-                log.info("Full response saved to: %s", response_file)
+                log.info("Campaign %s - response saved to: %s", campaign_id, response_file)
-            # Parse JSON from response
+            # Parse JSON
             import json
             data = json.loads(raw_text.strip('```json').strip('```').strip())
         else:
+            log.info("Campaign %s - using text-only extraction", campaign_id)
             data = generate_json_with_retry(
                 model,
-                prompt,
+                campaign_prompt + "\n\nTEXT:\n" + extraction_text,
                 debug_dir=debug_dir,
-                tag="substrate_scope",
+                tag=f"substrate_scope_{campaign_id}",
             )
         scope_data = data.get("substrate_scope_data", [])
-        log.info("Extracted %d substrate scope entries", len(scope_data))
+        # Add campaign_id to each entry if not present
+        for entry in scope_data:
+            if "campaign_id" not in entry:
+                entry["campaign_id"] = campaign_id
+        log.info("Campaign %s - extracted %d substrate scope entries", campaign_id, len(scope_data))
         return scope_data
     except Exception as exc:
-        log.error("Failed to extract substrate scope data: %s", exc)
+        log.error("Failed to extract substrate scope data for campaign %s: %s", campaign_id, exc)
         return []
 def _extract_single_reaction(
     text: str,
     model,
@@ -1911,7 +1986,7 @@ def _extract_single_reaction(
         log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
         return None
-def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
+def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping], campaign_id: Optional[str] = None) -> List[ScopeEntry]:
     """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
     entries: List[ScopeEntry] = []
@@ -2020,6 +2095,7 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
                 conditions=conditions,
                 data_location=item.get("data_location", ""),
                 data_source_type={"all": "text/figure"},
+                campaign_id=campaign_id or item.get("campaign_id", ""),
                 notes=item.get("notes", "")
             )
@@ -2050,7 +2126,10 @@ def get_substrate_scope(
     5. Extract individual reactions with context
     """
     # Step 1: Find locations using captions
-    locations = identify_scope_locations(caption_text, model, debug_dir=debug_dir)
+    # For backward compatibility, use campaign-specific function with generic parameters
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, "general", ["all"], debug_dir=debug_dir
+    )
     if locations:
         location_summary = []
         for loc in locations[:3]:
@@ -2111,10 +2190,13 @@ def get_substrate_scope(
                 log.warning("Failed to extract %s image for %s", location_type, figure_ref)
     # Extract all substrate scope data in one call
-    raw_entries = extract_all_substrate_scope_data(
+    # Note: This function is now deprecated in favor of campaign-specific extraction
+    # For backward compatibility, we'll use a generic campaign approach
+    raw_entries = extract_substrate_scope_entries_for_campaign(
         full_text, model, locations,
+        campaign_id="general",
+        enzyme_ids=["all"],
         pdf_paths=pdf_paths,
-        figure_images=figure_images,
         debug_dir=debug_dir
     )
@@ -2158,6 +2240,96 @@ def get_substrate_scope(
     return entries
+def get_substrate_scope_for_campaign(
+    caption_text: str,
+    full_text: str,
+    model,
+    *,
+    campaign_id: str,
+    enzyme_ids: List[str],
+    pdf_paths: Optional[List[Path]] = None,
+    debug_dir: str | Path | None = None,
+) -> List[ScopeEntry]:
+    """
+    Campaign-specific substrate scope extraction.
+    Like get_substrate_scope but focuses on a specific campaign and its enzymes.
+    Tells Gemini about the specific campaign and that it's okay to return null if
+    no substrate scope data exists for this campaign.
+    """
+    log.info("Starting campaign-specific substrate scope extraction for: %s", campaign_id)
+    log.info("Target enzymes: %s", enzyme_ids)
+    # Step 1: Find locations using captions with campaign context
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, campaign_id, enzyme_ids, debug_dir=debug_dir
+    )
+    if not locations:
+        log.info("No substrate scope locations identified for campaign %s", campaign_id)
+        return []
+    location_summary = []
+    for loc in locations[:3]:
+        location_summary.append(
+            f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
+            f"confidence: {loc.get('confidence', 0)})"
+        )
+    log.info("Campaign %s - identified %d substrate scope locations: %s",
+             campaign_id, len(locations), ", ".join(location_summary))
+    # Step 2: Identify IUPAC sections from SI TOC (reuse existing logic)
+    iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    log.info("Campaign %s - identified %d IUPAC sections", campaign_id, len(iupac_sections))
+    # Step 3: Extract raw entries with campaign context
+    raw_entries = extract_substrate_scope_entries_for_campaign(
+        full_text, model, locations, campaign_id, enzyme_ids,
+        pdf_paths=pdf_paths, debug_dir=debug_dir
+    )
+    if not raw_entries:
+        log.info("No substrate scope entries extracted for campaign %s", campaign_id)
+        return []
+    log.info("Campaign %s - extracted %d raw substrate scope entries", campaign_id, len(raw_entries))
+    # Step 4: Extract compound mappings (reuse existing logic)
+    figure_images = []
+    if pdf_paths:
+        for pdf_path in pdf_paths:
+            try:
+                figure_images.extend(extract_figure_images(pdf_path))
+            except Exception as e:
+                log.warning("Failed to extract figure images from %s: %s", pdf_path, e)
+    # Collect all compound IDs from raw entries
+    all_compound_ids = set()
+    for entry in raw_entries:
+        substrate_ids = entry.get("substrate_ids", [])
+        product_ids = entry.get("product_ids", [])
+        for sid in substrate_ids:
+            all_compound_ids.add(str(sid))
+        for pid in product_ids:
+            all_compound_ids.add(str(pid))
+    log.info("Campaign %s - found %d unique compound IDs to map", campaign_id, len(all_compound_ids))
+    # Extract compound mappings (reuse existing function)
+    compound_mappings = extract_compound_mappings(full_text, model,
+                                                 pdf_paths=pdf_paths,
+                                                 iupac_sections=iupac_sections,
+                                                 compound_ids=list(all_compound_ids),
+                                                 primary_locations=locations,
+                                                 debug_dir=debug_dir)
+    # Step 5: Parse all entries with compound mappings
+    entries = _parse_scope_entries(raw_entries, compound_mappings, campaign_id)
+    log.info("Campaign %s - successfully parsed %d substrate scope entries", campaign_id, len(entries))
+    return entries
 # === 7. VALIDATION & MERGE ===
 """Validation, duplicate detection, and merging with lineage data."""
@@ -2353,6 +2525,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
             'generation': entry.generation if entry.generation is not None else '',
+            'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',
             'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
@@ -2385,7 +2558,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
     # Define column order
     column_order = [
-        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
+        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation', 'campaign_id',
         'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
         'substrate_list', 'substrate_iupac_list',
         'product_list', 'product_iupac_list',
@@ -2447,23 +2620,83 @@ def run_pipeline(
     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()
-    # 3. Extract substrate scope -----------------------------------------------
-    entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    # 3. Check for campaign-based extraction -----------------------------------
+    all_entries = []
+    if lineage_csv:
+        import pandas as pd
+        lineage_df = pd.read_csv(lineage_csv)
+        # Check if we have campaign_id column - if so, process each campaign separately
+        if 'campaign_id' in lineage_df.columns:
+            campaigns = lineage_df['campaign_id'].unique()
+            log.info("Detected %d campaigns in lineage data - processing each separately", len(campaigns))
+            log.info("Campaigns: %s", campaigns.tolist())
+            # Simple campaign context for model reaction awareness
+            campaigns_context_text = f"All campaigns: {campaigns.tolist()}"
+            identify_scope_locations_for_campaign._all_campaigns_context = campaigns_context_text
+            extract_substrate_scope_entries_for_campaign._all_campaigns_context = campaigns_context_text
+            for campaign_id in campaigns:
+                log.info("\n" + "="*60)
+                log.info("Processing campaign: %s", campaign_id)
+                log.info("="*60)
+                # Get enzymes for this campaign
+                campaign_enzymes = lineage_df[lineage_df['campaign_id'] == campaign_id]
+                if 'enzyme_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme_id'].tolist()
+                elif 'enzyme' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme'].tolist()
+                elif 'variant_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['variant_id'].tolist()
+                else:
+                    raise ValueError("No enzyme ID column found in lineage data")
+                log.info("Campaign %s has %d enzymes: %s", campaign_id, len(enzyme_ids), enzyme_ids)
+                # Create campaign-specific debug dir
+                campaign_debug_dir = Path(debug_dir) / campaign_id if debug_dir else None
+                # Extract substrate scope for this campaign
+                campaign_entries = get_substrate_scope_for_campaign(
+                    caption_text, full_text, model,
+                    campaign_id=campaign_id,
+                    enzyme_ids=enzyme_ids,
+                    pdf_paths=pdf_paths,
+                    debug_dir=campaign_debug_dir
+                )
+                if campaign_entries:
+                    log.info("Extracted %d substrate scope entries for campaign %s", len(campaign_entries), campaign_id)
+                    all_entries.extend(campaign_entries)
+                else:
+                    log.info("No substrate scope data found for campaign %s", campaign_id)
+        else:
+            # Original single extraction
+            entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+            all_entries = entries
+    else:
+        # No lineage data - single extraction
+        entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+        all_entries = entries
-    if not entries:
-        raise RuntimeError("Pipeline aborted: failed to extract any substrate scope data")
+    if not all_entries:
+        log.warning("No substrate scope data extracted from any campaign")
+        all_entries = []  # Allow empty results
     # 4. Merge with lineage if available ---------------------------------------
-    if lineage_csv:
-        entries = merge_with_lineage(entries, Path(lineage_csv), model)
+    if lineage_csv and all_entries:
+        all_entries = merge_with_lineage(all_entries, Path(lineage_csv), model)
     # 5. Validate entries ------------------------------------------------------
-    warnings = validate_scope_entries(entries)
+    warnings = validate_scope_entries(all_entries)
     if warnings:
         log.warning("Found %d validation warnings", len(warnings))
     # 6. Convert to DataFrame --------------------------------------------------
-    df_final = _entries_to_dataframe(entries)
+    df_final = _entries_to_dataframe(all_entries)
     # 7. Write CSV if requested ------------------------------------------------
     if output_csv:

debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl