PyPI - debase - Versions diffs - 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl - Mend

debase: 0.4.2 (py3-none-any.whl) → 0.4.4 (py3-none-any.whl)

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

debase/_version.py +1 -1
debase/cleanup_sequence.py +656 -27
debase/enzyme_lineage_extractor.py +1077 -109
debase/lineage_format.py +221 -12
debase/reaction_info_extractor.py +133 -23
debase/substrate_scope_extractor.py +49 -2
debase/wrapper.py +155 -151
debase-0.4.4.dist-info/METADATA +121 -0
debase-0.4.4.dist-info/RECORD +16 -0
debase-0.4.2.dist-info/METADATA +0 -296
debase-0.4.2.dist-info/RECORD +0 -16
{debase-0.4.2.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
{debase-0.4.2.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
{debase-0.4.2.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.2.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0

debase/cleanup_sequence.py CHANGED Viewed

@@ -11,6 +11,7 @@ Usage:
 import argparse
 import logging
+import os
 import re
 import sys
 from dataclasses import dataclass, field
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
 import pandas as pd
+try:
+    import google.generativeai as genai  # type: ignore
+    GEMINI_OK = True
+except ImportError:  # pragma: no cover
+    GEMINI_OK = False
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+# Gemini API configuration
+GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
 # Configure module logger
 log = logging.getLogger(__name__)
@@ -193,12 +203,17 @@ class SequenceManipulator:
         return 0 if zero_matches >= one_matches else 1
     @classmethod
-    def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
-        """Apply mutations to a parent sequence."""
+    def apply_mutations(cls, parent_seq: str, mutation_str: str) -> Tuple[str, bool]:
+        """Apply mutations to a parent sequence.
+        Returns:
+            Tuple[str, bool]: (resulting_sequence, all_mutations_applied_successfully)
+        """
         if not parent_seq:
-            return ""
+            return "", True
         seq = list(parent_seq)
+        all_mutations_successful = True
         # Apply point mutations
         mutations = MutationParser.parse_mutations(mutation_str)
@@ -207,19 +222,26 @@ class SequenceManipulator:
             for mut in mutations:
                 idx = mut.position - idx_offset
+                mutation_applied = False
                 # Try primary index
                 if 0 <= idx < len(seq) and seq[idx].upper() == mut.original.upper():
                     seq[idx] = mut.replacement
+                    mutation_applied = True
                 else:
                     # Try alternate index
                     alt_idx = mut.position - (1 - idx_offset)
                     if 0 <= alt_idx < len(seq) and seq[alt_idx].upper() == mut.original.upper():
                         seq[alt_idx] = mut.replacement
-                    else:
-                        log.warning(
-                            f"Mutation {mut} does not match parent sequence at "
-                            f"position {mut.position} (tried both 0- and 1-based indexing)"
-                        )
+                        mutation_applied = True
+                if not mutation_applied:
+                    log.error(
+                        f"MUTATION MISMATCH: {mut} does not match parent sequence at "
+                        f"position {mut.position} (tried both 0- and 1-based indexing). "
+                        f"Parent has {seq[idx] if 0 <= idx < len(seq) else 'out-of-bounds'} at position {mut.position}"
+                    )
+                    all_mutations_successful = False
         # Apply complex C-terminal mutations
         complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
@@ -242,12 +264,13 @@ class SequenceManipulator:
                 if complex_mut.extension_seq:
                     seq.extend(list(complex_mut.extension_seq))
             else:
-                log.warning(
-                    f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
+                log.error(
+                    f"COMPLEX MUTATION MISMATCH: Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
                     f"{complex_mut.end_pos} for sequence of length {len(seq)}"
                 )
+                all_mutations_successful = False
-        return "".join(seq)
+        return "".join(seq), all_mutations_successful
     @classmethod
     def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
@@ -390,10 +413,11 @@ class LineageNavigator:
 class SequenceGenerator:
     """Main class for generating protein sequences from mutations."""
-    def __init__(self, df: pd.DataFrame):
+    def __init__(self, df: pd.DataFrame, strict_mutation_validation: bool = True):
         self.df = df
         self.navigator = LineageNavigator(df)
         self.manipulator = SequenceManipulator()
+        self.strict_mutation_validation = strict_mutation_validation
         self._update_ground_truths()
     def _update_ground_truths(self) -> None:
@@ -464,19 +488,62 @@ class SequenceGenerator:
         parent_id: str
     ) -> Optional[SequenceGenerationResult]:
         """Generate sequence by applying mutations to parent."""
-        parent_row = self.df[self.df["enzyme_id"] == parent_id].iloc[0]
-        parent_seq = parent_row.get("protein_sequence", "")
-        if not parent_seq:
+        # Get the variant to find its campaign
+        variant_rows = self.df[self.df["enzyme_id"] == variant_id]
+        if variant_rows.empty:
             return None
-        variant_row = self.df[self.df["enzyme_id"] == variant_id].iloc[0]
+        variant_row = variant_rows.iloc[0]
+        variant_campaign = variant_row.get("campaign_id", "")
         mutations = variant_row.get("mutations", "")
         if not mutations:
             return None
-        sequence = self.manipulator.apply_mutations(parent_seq, mutations)
+        # Find parent in the same campaign first
+        parent_rows = self.df[
+            (self.df["enzyme_id"] == parent_id) &
+            (self.df["campaign_id"] == variant_campaign)
+        ]
+        # If not found in same campaign, fall back to any parent with that ID
+        if parent_rows.empty:
+            parent_rows = self.df[self.df["enzyme_id"] == parent_id]
+            if not parent_rows.empty:
+                log.warning(f"Parent {parent_id} not found in same campaign {variant_campaign} for variant {variant_id}, using parent from different campaign")
+        if parent_rows.empty:
+            log.error(f"Parent {parent_id} not found for variant {variant_id}")
+            return None
+        parent_row = parent_rows.iloc[0]
+        parent_seq = parent_row.get("protein_sequence", "")
+        parent_campaign = parent_row.get("campaign_id", "")
+        if not parent_seq:
+            return None
+        # Log which parent sequence is being used
+        if parent_campaign != variant_campaign:
+            log.info(f"Using parent {parent_id} from campaign {parent_campaign} for variant {variant_id} in campaign {variant_campaign}")
+        else:
+            log.info(f"Using parent {parent_id} from same campaign {variant_campaign} for variant {variant_id}")
+        sequence, mutations_successful = self.manipulator.apply_mutations(parent_seq, mutations)
+        if not mutations_successful:
+            # Check if this might be an exact match case (mutations already present in parent)
+            # This happens when an enzyme from another campaign is identified as both parent and exact match
+            if parent_id == variant_id or (mutations and parent_seq == sequence):
+                log.info(f"Detected exact match scenario for {variant_id} - using parent sequence directly")
+                sequence = parent_seq
+                mutations_successful = True
+            elif self.strict_mutation_validation:
+                log.error(f"STRICT MODE: Failed to apply mutations for {variant_id}: mutation mismatch detected. Not populating sequence to prevent incorrect data.")
+                return None
+            else:
+                log.warning(f"Mutation mismatch for {variant_id}, but proceeding with generated sequence (strict_mutation_validation=False)")
+                # Continue with the sequence even if mutations failed
         return SequenceGenerationResult(
             sequence=sequence,
@@ -538,10 +605,14 @@ class SequenceGenerator:
         # Generate based on direction
         if direction == "up" and parent_id and mutations:
-            if gt_id == parent_id:
-                return self.generate_from_parent(variant_id, parent_id)
-            else:
-                # Non-direct ancestor - less reliable
+            # Always try the declared parent first
+            result = self.generate_from_parent(variant_id, parent_id)
+            if result:
+                return result
+            # If declared parent fails, try the ground truth (if different)
+            if gt_id != parent_id:
+                log.info(f"Declared parent {parent_id} failed for {variant_id}, trying ground truth {gt_id}")
                 result = self.generate_from_parent(variant_id, gt_id)
                 if result:
                     result.confidence = 0.7
@@ -565,14 +636,501 @@ class SequenceGenerator:
         return None
-# === 7. MAIN PROCESSOR === ---------------------------------------------------
+# === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
+def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
+    """Use Gemini API to identify parent enzymes for entries with missing parent information."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
+        return df
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
+        return df
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
+        return df
+    # Find entries with empty sequences but missing parent information
+    entries_needing_parents = []
+    for idx, row in df.iterrows():
+        protein_sequence = str(row.get("protein_sequence", "")).strip()
+        parent_id = str(row.get("parent_enzyme_id", "")).strip()
+        # Only process entries that have empty sequences AND no parent info
+        if (not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]) and (not parent_id or parent_id.lower() in ["nan", "none", ""]):
+            enzyme_id = str(row.get("enzyme_id", ""))
+            campaign_id = str(row.get("campaign_id", ""))
+            generation = str(row.get("generation", ""))
+            entries_needing_parents.append({
+                "idx": idx,
+                "enzyme_id": enzyme_id,
+                "campaign_id": campaign_id,
+                "generation": generation
+            })
+    if not entries_needing_parents:
+        log.info("No entries need parent identification from Gemini")
+        return df
+    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
+    # Create a lookup of all available enzyme IDs for context
+    available_enzymes = {}
+    for idx, row in df.iterrows():
+        enzyme_id = str(row.get("enzyme_id", ""))
+        campaign_id = str(row.get("campaign_id", ""))
+        protein_sequence = str(row.get("protein_sequence", "")).strip()
+        generation = str(row.get("generation", ""))
+        if enzyme_id and enzyme_id.lower() != "nan":
+            available_enzymes[enzyme_id] = {
+                "campaign_id": campaign_id,
+                "has_sequence": bool(protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]),
+                "generation": generation
+            }
+    identified_count = 0
+    for entry in entries_needing_parents:
+        enzyme_id = entry["enzyme_id"]
+        campaign_id = entry["campaign_id"]
+        generation = entry["generation"]
+        # Create context for Gemini
+        context_info = []
+        context_info.append(f"Enzyme ID: {enzyme_id}")
+        context_info.append(f"Campaign ID: {campaign_id}")
+        if generation:
+            context_info.append(f"Generation: {generation}")
+        # Add available enzymes from the same campaign for context
+        campaign_enzymes = []
+        for enz_id, enz_data in available_enzymes.items():
+            if enz_data["campaign_id"] == campaign_id:
+                status = "with sequence" if enz_data["has_sequence"] else "without sequence"
+                gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
+                campaign_enzymes.append(f"  - {enz_id} {status} {gen_info}")
+        if campaign_enzymes:
+            context_info.append("Available enzymes in same campaign:")
+            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context
+        context_text = "\n".join(context_info)
+        prompt = f"""
+Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
+{context_text}
+This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
+Please provide your response in this format:
+Parent: [parent_enzyme_id or "Unknown"]
+If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
+"""
+        try:
+            response = model.generate_content(prompt)
+            response_text = response.text.strip()
+            # Parse the response
+            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
+            if parent_match:
+                parent = parent_match.group(1).strip()
+                if parent and parent != "Unknown" and parent != "No parent identified":
+                    # Verify the parent exists in our available enzymes
+                    if parent in available_enzymes:
+                        df.at[entry["idx"], "parent_enzyme_id"] = parent
+                        identified_count += 1
+                        log.info(f"Identified parent for {enzyme_id}: {parent}")
+                    else:
+                        log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
+        except Exception as e:
+            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
+            continue
+    if identified_count > 0:
+        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
+    else:
+        log.info("No parent enzymes were identified using Gemini API")
+    return df
+# === 8. SEQUENCE SOURCE IDENTIFICATION === -----------------------------------
+def identify_sequence_sources_with_gemini(df: pd.DataFrame, debug_dir: Optional[Path] = None) -> pd.DataFrame:
+    """Use Gemini API to identify which parent sequences to use for entries with missing sequences."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping sequence source identification.")
+        return df
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping sequence source identification.")
+        return df
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping sequence source identification.")
+        return df
+    # Group by campaign to process each campaign separately
+    campaigns = df['campaign_id'].unique()
+    for campaign_id in campaigns:
+        if pd.isna(campaign_id):
+            campaign_mask = df['campaign_id'].isna()
+            campaign_id_str = "unknown"
+        else:
+            campaign_mask = df['campaign_id'] == campaign_id
+            campaign_id_str = str(campaign_id)
+        campaign_df = df[campaign_mask]
+        # Find entries with empty sequences in this campaign
+        empty_seq_entries = []
+        available_seq_entries = []
+        for idx, row in campaign_df.iterrows():
+            enzyme_id = str(row.get("enzyme_id", ""))
+            protein_sequence = str(row.get("protein_sequence", "")).strip()
+            parent_id = str(row.get("parent_enzyme_id", "")).strip()
+            mutations = str(row.get("mutations", "")).strip()
+            generation = str(row.get("generation", ""))
+            if not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]:
+                empty_seq_entries.append({
+                    "idx": idx,
+                    "enzyme_id": enzyme_id,
+                    "parent_id": parent_id if parent_id != "nan" else None,
+                    "mutations": mutations if mutations != "nan" else None,
+                    "generation": generation
+                })
+            else:
+                available_seq_entries.append({
+                    "enzyme_id": enzyme_id,
+                    "generation": generation,
+                    "seq_length": len(protein_sequence)
+                })
+        # Skip if no empty sequences
+        if not empty_seq_entries:
+            continue
+        # Check if this is a partially empty situation (some have sequences, some don't)
+        total_entries = len(campaign_df)
+        empty_count = len(empty_seq_entries)
+        log.info(f"Campaign {campaign_id_str}: {empty_count}/{total_entries} entries have empty sequences")
+        if empty_count == total_entries:
+            # All sequences are empty - try to find cross-campaign relationships
+            log.info(f"Campaign {campaign_id_str}: All sequences are empty ({empty_count}/{total_entries}). "
+                     f"Searching for cross-campaign parent relationships...")
+            # Get all enzymes with sequences from OTHER campaigns
+            other_campaigns_with_seqs = []
+            for other_campaign in campaigns:
+                if other_campaign == campaign_id or pd.isna(other_campaign):
+                    continue
+                other_mask = df['campaign_id'] == other_campaign
+                other_df = df[other_mask]
+                for idx, row in other_df.iterrows():
+                    protein_sequence = str(row.get("protein_sequence", "")).strip()
+                    if protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]:
+                        enzyme_id = str(row.get("enzyme_id", ""))
+                        generation = str(row.get("generation", ""))
+                        other_campaigns_with_seqs.append({
+                            "enzyme_id": enzyme_id,
+                            "campaign_id": str(other_campaign),
+                            "generation": generation,
+                            "seq_length": len(protein_sequence)
+                        })
+            if not other_campaigns_with_seqs:
+                log.info(f"Campaign {campaign_id_str}: No sequences found in other campaigns to use as cross-campaign parents")
+                continue
+            # Create context for cross-campaign analysis
+            context_lines = []
+            context_lines.append(f"Empty Campaign: {campaign_id_str} (all {empty_count} enzymes need sequences)")
+            context_lines.append(f"\nEnzymes in empty campaign:")
+            for entry in empty_seq_entries[:10]:  # Limit for context
+                parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
+                mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
+                context_lines.append(f"  - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
+            context_lines.append(f"\nEnzymes with sequences from OTHER campaigns ({len(other_campaigns_with_seqs)}):")
+            for entry in other_campaigns_with_seqs[:15]:  # Limit for context
+                # Get the actual sequence for this enzyme
+                enzyme_rows = df[df['enzyme_id'] == entry['enzyme_id']]
+                if not enzyme_rows.empty:
+                    sequence = str(enzyme_rows.iloc[0]['protein_sequence'])
+                    context_lines.append(f"  - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, sequence: {sequence})")
+                else:
+                    context_lines.append(f"  - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
+            context_text = "\n".join(context_lines)
+            # Find ONE good cross-campaign seed to bootstrap this campaign
+            log.info(f"Campaign {campaign_id_str}: Looking for ONE cross-campaign seed to bootstrap sequences...")
+            # Create a prompt to find the BEST single seed
+            prompt = f"""
+Based on enzyme names, identify the SINGLE BEST seed enzyme from other campaigns to bootstrap the empty campaign.
+{context_text}
+From the enzymes in the EMPTY campaign, identify which ONE has the clearest match in OTHER campaigns.
+Prioritize:
+1. EXACT name matches (highest priority)
+2. Simplest parent relationships (e.g., an enzyme that differs by only 1-2 mutations)
+3. Earliest generation enzymes (lower generation numbers are better seeds)
+Return your response as a JSON dictionary with this exact format:
+{{
+  "seed_enzyme": {{
+    "target_enzyme_id": "the enzyme ID in the empty campaign",
+    "relationship_type": "EXACT_MATCH" or "BEST_PARENT",
+    "source": {{
+      "campaign_id": "the campaign ID",
+      "enzyme_id": "the enzyme ID WITHOUT campaign suffix"
+    }},
+    "confidence": 0.1 to 1.0,
+    "reason": "brief explanation of why this is the best seed"
+  }}
+}}
+Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
+"""
+            try:
+                # Save debug information if debug_dir is provided
+                if debug_dir:
+                    import time
+                    timestamp = time.strftime("%Y%m%d_%H%M%S")
+                    prompt_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_prompt_{timestamp}.txt"
+                    prompt_file.write_text(prompt)
+                response = model.generate_content(prompt)
+                response_text = response.text.strip()
+                # Save response if debug_dir is provided
+                if debug_dir:
+                    response_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_response_{timestamp}.txt"
+                    response_file.write_text(response_text)
+                # Parse the JSON response
+                import json
+                try:
+                    # Clean the response text if it contains markdown
+                    if '```json' in response_text:
+                        response_text = response_text.split('```json')[1].split('```')[0].strip()
+                    elif '```' in response_text:
+                        response_text = response_text.split('```')[1].split('```')[0].strip()
+                    seed_data = json.loads(response_text)
+                    seed_info = seed_data.get('seed_enzyme', {})
+                    if seed_info:
+                        target_enzyme_id = seed_info.get('target_enzyme_id', '')
+                        relationship_type = seed_info.get('relationship_type', '').upper()
+                        source_info = seed_info.get('source', {})
+                        source_enzyme_id = source_info.get('enzyme_id', '')
+                        source_campaign_id = source_info.get('campaign_id', '')
+                        confidence = float(seed_info.get('confidence', 0.5))
+                        reason = seed_info.get('reason', '')
+                        log.info(f"Campaign {campaign_id_str}: Found seed - {target_enzyme_id} from {source_enzyme_id} ({relationship_type}, confidence: {confidence})")
+                        log.info(f"Reason: {reason}")
+                        if source_enzyme_id:
+                            # Find the source enzyme's sequence in the dataframe
+                            source_rows = df[df['enzyme_id'] == source_enzyme_id]
+                            if source_rows.empty:
+                                log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
+                            else:
+                                source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
+                                if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
+                                    log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
+                                else:
+                                    # Find the target enzyme in our empty list
+                                    seed_found = False
+                                    for entry in empty_seq_entries:
+                                        if entry['enzyme_id'] == target_enzyme_id:
+                                            if relationship_type == "EXACT_MATCH":
+                                                # Exact match - copy sequence directly
+                                                df.at[entry['idx'], 'protein_sequence'] = source_sequence
+                                                df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
+                                                log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
+                                                seed_found = True
+                                            elif relationship_type == "BEST_PARENT":
+                                                # Parent relationship - apply mutations to get the target sequence
+                                                target_mutations = entry.get('mutations', '').strip()
+                                                if target_mutations:
+                                                    # Apply mutations using SequenceManipulator
+                                                    manipulator = SequenceManipulator()
+                                                    mutated_sequence, success = manipulator.apply_mutations(source_sequence, target_mutations)
+                                                    if success:
+                                                        df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
+                                                        df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
+                                                        log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
+                                                        seed_found = True
+                                                    else:
+                                                        log.warning(f"Failed to apply mutations {target_mutations} to parent {source_enzyme_id} for {target_enzyme_id}")
+                                                else:
+                                                    # No mutations - use parent sequence directly
+                                                    df.at[entry['idx'], 'protein_sequence'] = source_sequence
+                                                    df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
+                                                    log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
+                                                    seed_found = True
+                                            break
+                                    if seed_found:
+                                        log.info(f"Campaign {campaign_id_str}: Successfully set cross-campaign seed. Local processing will handle the rest.")
+                                    else:
+                                        log.warning(f"Campaign {campaign_id_str}: Could not find target enzyme {target_enzyme_id} in empty list")
+                except json.JSONDecodeError as e:
+                    log.warning(f"Failed to parse JSON response for cross-campaign seed: {e}")
+                    log.debug(f"Response text: {response_text}")
+            except Exception as e:
+                log.warning(f"Failed to identify cross-campaign seed for {campaign_id_str}: {e}")
+            continue
+        log.info(f"Campaign {campaign_id_str}: Found {empty_count}/{total_entries} entries with empty sequences. "
+                 f"Querying Gemini for sequence sources...")
+        # Create context for Gemini
+        context_lines = []
+        context_lines.append(f"Campaign: {campaign_id_str}")
+        context_lines.append(f"\nEnzymes WITH sequences ({len(available_seq_entries)}):")
+        for entry in available_seq_entries[:15]:  # Limit to first 15 for context
+            context_lines.append(f"  - {entry['enzyme_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
+        context_lines.append(f"\nEnzymes WITHOUT sequences ({len(empty_seq_entries)}):")
+        for entry in empty_seq_entries[:15]:  # Limit to first 15 for context
+            parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
+            mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
+            context_lines.append(f"  - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
+        context_text = "\n".join(context_lines)
+        # Process in batches if there are many empty sequences
+        batch_size = 10
+        identified_count = 0
+        for i in range(0, len(empty_seq_entries), batch_size):
+            batch = empty_seq_entries[i:i+batch_size]
+            # Create batch request
+            batch_request = []
+            for entry in batch:
+                parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ""
+                mut_info = f", mutations: {entry['mutations']}" if entry['mutations'] else ""
+                batch_request.append(f"{entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
+            prompt = f"""
+Based on the enzyme lineage information provided, identify which enzyme sequences should be used as the source to calculate sequences for the enzymes without sequences.
+{context_text}
+For each of these enzymes without sequences, identify which enzyme WITH a sequence should be used as the source:
+{chr(10).join(batch_request)}
+Instructions:
+1. If an enzyme has a parent_id and mutations, suggest using the parent's sequence
+2. If an enzyme has no parent_id, look for the most logical ancestor or related enzyme with a sequence
+3. Consider the generation numbers and enzyme naming patterns
+4. Only suggest enzymes that actually have sequences
+Please provide your response in this format:
+enzyme_id -> source_enzyme_id
+enzyme_id -> source_enzyme_id
+...
+If you cannot identify a suitable source, use "None" as the source_enzyme_id.
+"""
+            try:
+                # Save debug information if debug_dir is provided
+                if debug_dir:
+                    import time
+                    timestamp = time.strftime("%Y%m%d_%H%M%S")
+                    prompt_file = debug_dir / f"sequence_source_{campaign_id_str}_prompt_{timestamp}.txt"
+                    prompt_file.write_text(prompt)
+                response = model.generate_content(prompt)
+                response_text = response.text.strip()
+                # Save response if debug_dir is provided
+                if debug_dir:
+                    response_file = debug_dir / f"sequence_source_{campaign_id_str}_response_{timestamp}.txt"
+                    response_file.write_text(response_text)
+                # Parse the response
+                for line in response_text.split('\n'):
+                    if '->' in line:
+                        parts = line.split('->')
+                        if len(parts) == 2:
+                            target_enzyme = parts[0].strip()
+                            source_enzyme = parts[1].strip()
+                            if source_enzyme and source_enzyme != "None":
+                                # Find the target enzyme in our batch
+                                for entry in batch:
+                                    if entry['enzyme_id'] == target_enzyme:
+                                        # Verify the source enzyme exists and has a sequence
+                                        source_rows = df[df['enzyme_id'] == source_enzyme]
+                                        if not source_rows.empty:
+                                            source_seq = source_rows.iloc[0]['protein_sequence']
+                                            if source_seq and str(source_seq).strip() and str(source_seq) != "nan":
+                                                # Update the parent_enzyme_id if it's missing
+                                                if not entry['parent_id']:
+                                                    df.at[entry['idx'], 'parent_enzyme_id'] = source_enzyme
+                                                    df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_suggested_parent"
+                                                    identified_count += 1
+                                                    log.info(f"Set {source_enzyme} as parent for {target_enzyme} (Gemini suggestion)")
+                                                elif entry['parent_id'] != source_enzyme:
+                                                    # Log if Gemini suggests a different parent than what's recorded
+                                                    log.info(f"Gemini suggests {source_enzyme} as source for {target_enzyme}, "
+                                                           f"but parent is recorded as {entry['parent_id']}")
+                                        break
+            except Exception as e:
+                log.warning(f"Failed to identify sequence sources for batch {i//batch_size + 1}: {e}")
+                continue
+        if identified_count > 0:
+            log.info(f"Campaign {campaign_id_str}: Successfully identified {identified_count} sequence sources using Gemini")
+    return df
+# === 9. MAIN PROCESSOR === ---------------------------------------------------
 class SequenceProcessor:
     """Main processor for handling the complete workflow."""
-    def __init__(self, input_csv: Path, output_csv: Path):
+    def __init__(self, input_csv: Path, output_csv: Path, debug_dir: Optional[Path] = None, strict_mutation_validation: bool = True):
         self.input_csv = input_csv
         self.output_csv = output_csv
+        self.debug_dir = debug_dir
+        self.strict_mutation_validation = strict_mutation_validation
         self.df = None
         self.generator = None
@@ -593,7 +1151,7 @@ class SequenceProcessor:
             self.df["flag"] = ""
         # Initialize generator
-        self.generator = SequenceGenerator(self.df)
+        self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
     def _normalize_columns(self) -> None:
         """Automatically detect and normalize column names from different formats."""
@@ -855,7 +1413,7 @@ class SequenceProcessor:
             self.df = self.df[campaign_mask].copy()
             # Rebuild relationships for this campaign
-            self.generator = SequenceGenerator(self.df)
+            self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
             # Flag complex mutations
             self.flag_complex_mutations()
@@ -866,6 +1424,17 @@ class SequenceProcessor:
             self.process_remaining()
             self.backward_pass()
+            # Use Gemini to identify parent enzymes for entries with missing sequences
+            log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
+            self.df = identify_parents_with_gemini(self.df)
+            # Rebuild relationships after parent identification
+            self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
+            # Try to fill sequences again after parent identification
+            log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
+            self.process_remaining()
             # Update the original dataframe with results
             original_df.loc[campaign_mask, :] = self.df
@@ -874,6 +1443,50 @@ class SequenceProcessor:
             log.info(f"Completed campaign: {campaign_id}")
+        # After processing all campaigns, check for any remaining empty sequences
+        # and use Gemini to identify sequence sources (including cross-campaign relationships)
+        empty_count = sum(self.df["protein_sequence"].str.strip() == "")
+        total_count = len(self.df)
+        if empty_count > 0:
+            log.info(f"Found {empty_count}/{total_count} empty sequences after initial processing. "
+                     "Using Gemini to identify sequence sources (including cross-campaign relationships)...")
+            self.df = identify_sequence_sources_with_gemini(self.df, self.debug_dir)
+            # Process campaigns again after identifying new parent relationships
+            log.info("Reprocessing campaigns after sequence source identification...")
+            for campaign_id in campaigns:
+                if pd.isna(campaign_id):
+                    campaign_id = "unknown"
+                log.info(f"Reprocessing campaign: {campaign_id}")
+                # Filter data for this campaign
+                campaign_mask = self.df['campaign_id'] == campaign_id
+                if pd.isna(campaign_id):
+                    campaign_mask = self.df['campaign_id'].isna()
+                # Store original dataframe
+                original_df = self.df
+                # Process only this campaign's data
+                self.df = self.df[campaign_mask].copy()
+                # Rebuild relationships for this campaign
+                self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
+                # Try to fill sequences again
+                self.process_remaining()
+                # Update the original dataframe with results
+                original_df.loc[campaign_mask, :] = self.df
+                # Restore original dataframe
+                self.df = original_df
+                log.info(f"Completed reprocessing campaign: {campaign_id}")
         # Save results
         self.save_results()
@@ -922,6 +1535,16 @@ def main(argv: Optional[List[str]] = None) -> None:
         default=0,
         help="Increase verbosity (use -vv for debug output)"
     )
+    parser.add_argument(
+        "--debug-dir",
+        type=Path,
+        help="Directory to save debug information (Gemini prompts and responses)"
+    )
+    parser.add_argument(
+        "--allow-mutation-mismatches",
+        action="store_true",
+        help="Allow sequence generation even when mutations don't match (default: strict validation)"
+    )
     args = parser.parse_args(argv)
@@ -929,7 +1552,13 @@ def main(argv: Optional[List[str]] = None) -> None:
     setup_logging(args.verbose)
     # Process the data (format detection is automatic)
-    processor = SequenceProcessor(args.input_csv, args.output_csv)
+    strict_validation = not args.allow_mutation_mismatches
+    processor = SequenceProcessor(
+        args.input_csv,
+        args.output_csv,
+        getattr(args, 'debug_dir', None),
+        strict_mutation_validation=strict_validation
+    )
     processor.run()

debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl