debase 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +151 -1
- debase/enzyme_lineage_extractor.py +114 -20
- debase/lineage_format.py +335 -56
- debase/reaction_info_extractor.py +60 -32
- debase/substrate_scope_extractor.py +366 -93
- debase/wrapper.py +37 -11
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/METADATA +1 -1
- debase-0.4.3.dist-info/RECORD +16 -0
- debase-0.4.1.dist-info/RECORD +0 -16
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/WHEEL +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/entry_points.txt +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -52,6 +52,12 @@ try:
 except ImportError:  # pragma: no cover
     RDKIT_OK = False

+try:
+    import google.generativeai as genai  # type: ignore
+    GEMINI_OK = True
+except ImportError:  # pragma: no cover
+    GEMINI_OK = False
+
 # Input columns that MUST be present ------------------------------------------------
 INPUT_REQUIRED: Tuple[str, ...] = (
     "enzyme_id",
@@ -106,9 +112,13 @@ OUTPUT_COLUMNS: Tuple[str, ...] = (
     "x_coordinate",
     "y_coordinate",
     "fitness_value",
+    "fitness_type",
     "cofactor",
     "reaction_condition",
     "ee",
+    "campaign_id",
+    "generation",
+    "parent_enzyme_id",
     "additional_information",
 )

@@ -130,6 +140,9 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
 # Local PubChem DB (optional) --------------------------------------------------------
 PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"

+# Gemini API configuration -----------------------------------------------------------
+GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
+
 # Miscellaneous ----------------------------------------------------------------------
 WELL_ROWS: str = "ABCDEFGH"  # 8 rows, 12 cols => 96 wells

@@ -231,9 +244,13 @@ class FlatRow:
     x_coordinate: str = ""
     y_coordinate: str = ""
     fitness_value: Optional[float] = None
+    fitness_type: str = ""
     cofactor: str = ""
     reaction_condition: str = ""
     ee: str = ""
+    campaign_id: str = ""
+    generation: str = ""
+    parent_enzyme_id: str = ""
     additional_information: str = ""

     def as_dict(self) -> Dict[str, str]:
@@ -253,9 +270,13 @@ class FlatRow:
             "x_coordinate": self.x_coordinate,
             "y_coordinate": self.y_coordinate,
             "fitness_value": self.fitness_value,
+            "fitness_type": self.fitness_type,
             "cofactor": self.cofactor,
             "reaction_condition": self.reaction_condition,
             "ee": self.ee,
+            "campaign_id": self.campaign_id,
+            "generation": self.generation,
+            "parent_enzyme_id": self.parent_enzyme_id,
             "additional_information": self.additional_information,
         }
         # Convert None to empty string for CSV friendliness
@@ -527,39 +548,224 @@ def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
 # === 7. FLATTENING CORE ============================================================

 def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
-    """Fill missing sequences in substrate scope entries from
-    … (1 line elided by the registry diff viewer)
+    """Fill missing sequences in substrate scope entries from reaction data entries.
+
+    This function:
+    1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
+    2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    """
+    # Step 1: Clean up 3a data format
+    log.info("Cleaning up reaction data (3a) format...")
+
+    # Handle column aliasing for enzyme_id
+    if 'enzyme' in df.columns and 'enzyme_id' not in df.columns:
+        df['enzyme_id'] = df['enzyme']
+        log.info("Renamed 'enzyme' column to 'enzyme_id' in reaction data")
+
+    # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}

-    #
-    … (12 lines elided by the registry diff viewer)
+    # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
+    reaction_entries = df[df.get("data_type") == "lineage"]
+    log.info(f"Found {len(reaction_entries)} reaction data entries to extract sequences from")
+
+    for _, row in reaction_entries.iterrows():
+        eid = str(row["enzyme_id"])
+        campaign_id = str(row.get("campaign_id", "default"))
+
+        # Prioritize protein_sequence (from 3a) over aa_sequence (from lineage file)
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", "")) or str(row.get("dna_seq", ""))
+
+        if aa_seq and aa_seq != "nan" and aa_seq != "":
+            # Use campaign_id + enzyme_id as composite key for exact matching
+            composite_key = f"{campaign_id}_{eid}"
+            seq_lookup[composite_key] = {
+                "aa_sequence": aa_seq,
+                "nt_sequence": nt_seq if nt_seq != "nan" else "",
+                "campaign_id": campaign_id,
+                "enzyme_id": eid
+            }
+
+            # Also keep simple enzyme_id lookup as fallback
+            seq_lookup[eid] = {
+                "aa_sequence": aa_seq,
+                "nt_sequence": nt_seq if nt_seq != "nan" else "",
+                "campaign_id": campaign_id,
+                "enzyme_id": eid
+            }
+
+    log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
+
+    # Step 3: Fill missing sequences in substrate scope entries (3b)
+    substrate_entries = df[df.get("data_type") == "substrate_scope"]
+    log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
+
     filled_count = 0
     for idx, row in df.iterrows():
+        if row.get("data_type") != "substrate_scope":
+            continue
+
         eid = str(row["enzyme_id"])
+        campaign_id = str(row.get("campaign_id", "default"))

         # Check if this row needs sequence filling
         aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
-        if
-        … (2 lines elided by the registry diff viewer)
-        if seq_lookup
-            df.at[idx, "
-            df.at[idx, "
-        … (1 line elided by the registry diff viewer)
+        if not aa_seq or aa_seq == "nan" or aa_seq == "":
+            # Try campaign-specific lookup first (most precise match)
+            composite_key = f"{campaign_id}_{eid}"
+            if composite_key in seq_lookup:
+                df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                if seq_lookup[composite_key]["nt_sequence"]:
+                    df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                    df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                filled_count += 1
+                log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
+
+            # Fallback to enzyme_id only lookup
+            elif eid in seq_lookup:
+                df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
+                df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
+                if seq_lookup[eid]["nt_sequence"]:
+                    df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
+                    df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+                filled_count += 1
+                log.debug(f"Filled sequence for {eid} (fallback lookup)")
+
+            else:
+                log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")

     if filled_count > 0:
-        log.info(f"
+        log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
+
+    return df
+
+
+def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
+    """Use Gemini API to identify parent enzymes for entries with missing parent information."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
+        return df
+
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
+        return df
+
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
+        return df
+
+    # Find entries with empty sequences but missing parent information
+    entries_needing_parents = []
+    for idx, row in df.iterrows():
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
+        parent_id = str(row.get("parent_enzyme_id", "")).strip()
+
+        # Only process entries that have empty sequences AND no parent info
+        if (not aa_seq or aa_seq == "nan" or aa_seq == "") and (not nt_seq or nt_seq == "nan" or nt_seq == "") and (not parent_id or parent_id == "nan"):
+            enzyme_id = str(row.get("enzyme_id", ""))
+            campaign_id = str(row.get("campaign_id", ""))
+            generation = str(row.get("generation", ""))
+
+            entries_needing_parents.append({
+                "idx": idx,
+                "enzyme_id": enzyme_id,
+                "campaign_id": campaign_id,
+                "generation": generation
+            })
+
+    if not entries_needing_parents:
+        log.info("No entries need parent identification from Gemini")
+        return df
+
+    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
+
+    # Create a lookup of all available enzyme IDs for context
+    available_enzymes = {}
+    for idx, row in df.iterrows():
+        enzyme_id = str(row.get("enzyme_id", ""))
+        campaign_id = str(row.get("campaign_id", ""))
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        generation = str(row.get("generation", ""))
+
+        if enzyme_id and enzyme_id != "nan":
+            available_enzymes[enzyme_id] = {
+                "campaign_id": campaign_id,
+                "has_sequence": bool(aa_seq and aa_seq != "nan" and aa_seq != ""),
+                "generation": generation
+            }
+
+    identified_count = 0
+    for entry in entries_needing_parents:
+        enzyme_id = entry["enzyme_id"]
+        campaign_id = entry["campaign_id"]
+        generation = entry["generation"]
+
+        # Create context for Gemini
+        context_info = []
+        context_info.append(f"Enzyme ID: {enzyme_id}")
+        context_info.append(f"Campaign ID: {campaign_id}")
+        if generation:
+            context_info.append(f"Generation: {generation}")
+
+        # Add available enzymes from the same campaign for context
+        campaign_enzymes = []
+        for enz_id, enz_data in available_enzymes.items():
+            if enz_data["campaign_id"] == campaign_id:
+                status = "with sequence" if enz_data["has_sequence"] else "without sequence"
+                gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
+                campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")
+
+        if campaign_enzymes:
+            context_info.append("Available enzymes in same campaign:")
+            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context
+
+        context_text = "\n".join(context_info)
+
+        prompt = f"""
+Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
+
+{context_text}
+
+This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
+
+Please provide your response in this format:
+Parent: [parent_enzyme_id or "Unknown"]
+
+If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
+"""
+
+        try:
+            response = model.generate_content(prompt)
+            response_text = response.text.strip()
+
+            # Parse the response
+            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
+
+            if parent_match:
+                parent = parent_match.group(1).strip()
+                if parent and parent != "Unknown" and parent != "No parent identified":
+                    # Verify the parent exists in our available enzymes
+                    if parent in available_enzymes:
+                        df.at[entry["idx"], "parent_enzyme_id"] = parent
+                        identified_count += 1
+                        log.info(f"Identified parent for {enzyme_id}: {parent}")
+                    else:
+                        log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
+
+        except Exception as e:
+            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
+            continue
+
+    if identified_count > 0:
+        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
+    else:
+        log.info("No parent enzymes were identified using Gemini API")

     return df

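Note on the hunk above: sequences are registered under both a campaign-qualified key (f"{campaign_id}_{eid}") and the bare enzyme id, and readers try the qualified key first. A minimal, self-contained sketch of that resolution order, with hypothetical ids and sequences (not code from the package):

    seq_lookup = {
        "campA_P411-A1": {"aa_sequence": "MTIKE"},  # campaign-qualified key (preferred)
        "P411-A1": {"aa_sequence": "MTIKE"},        # bare-id fallback
    }

    def resolve_sequence(campaign_id: str, enzyme_id: str) -> str:
        # Try the exact campaign match first; fall back to the bare enzyme id.
        for key in (f"{campaign_id}_{enzyme_id}", enzyme_id):
            entry = seq_lookup.get(key)
            if entry:
                return entry["aa_sequence"]
        return ""

    assert resolve_sequence("campA", "P411-A1") == "MTIKE"
    assert resolve_sequence("campB", "P411-A1") == "MTIKE"  # resolved via the fallback key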
@@ -574,7 +780,7 @@ def _plate_and_well(index: int) -> Tuple[int, str, str]:
     return plate_number, plate_name, well


-def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
+def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str], campaign_id: str = "default") -> str:
     """Get root enzyme id, falling back to generation 0 ancestor or self."""
     if eid in lineage_roots:
         return lineage_roots[eid]
@@ -582,7 +788,12 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
     seen: set[str] = set()
     while cur and cur not in seen:
         seen.add(cur)
+        # Try campaign-specific lookup first, then fall back to composite key
         row = idmap.get(cur, {})
+        if not row:
+            composite_key = f"{campaign_id}_{cur}"
+            row = idmap.get(composite_key, {})
+
         # Look for generation 0 as the root
         if str(row.get("generation", "")).strip() == "0":
             return cur
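Note: the loop above walks parent pointers until it reaches a generation-0 ancestor, with the `seen` set guarding against cycles; the new `campaign_id` parameter only adds a composite-key fallback for each per-step lookup. A self-contained sketch of the walk itself, over a hypothetical three-member lineage:

    idmap = {
        "v3": {"parent_enzyme_id": "v2", "generation": "2"},
        "v2": {"parent_enzyme_id": "v1", "generation": "1"},
        "v1": {"parent_enzyme_id": "", "generation": "0"},
    }

    def root_of(eid: str) -> str:
        seen: set[str] = set()
        cur = eid
        while cur and cur not in seen:  # the seen-set breaks accidental cycles
            seen.add(cur)
            row = idmap.get(cur, {})
            if str(row.get("generation", "")).strip() == "0":
                return cur
            cur = str(row.get("parent_enzyme_id", ""))
        return eid  # fall back to self when no generation-0 ancestor is found

    assert root_of("v3") == "v1"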
@@ -674,6 +885,12 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)

+    # Use Gemini API to identify parent enzymes for entries with missing sequences
+    df = _identify_parents_with_gemini(df)
+
+    # Fill sequences again after parent identification to propagate sequences from identified parents
+    df = _fill_missing_sequences(df)
+
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)

@@ -694,24 +911,42 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)

     # 3. Flatten rows ---------------------------------------------------------
-    # Create idmap for parent lookups,
+    # Create idmap for parent lookups, using campaign_id + enzyme_id as composite key
     idmap = {}
+    campaign_idmap = {}  # For within-campaign lookups
+
     for _, r in df.iterrows():
         eid = str(r["enzyme_id"])
-        … (10 lines elided by the registry diff viewer)
+        campaign_id = str(r.get("campaign_id", "default"))
+
+        # Use composite key for global idmap
+        composite_key = f"{campaign_id}_{eid}"
+        idmap[composite_key] = r.to_dict()
+
+        # Also maintain campaign-specific idmap for parent lookups
+        if campaign_id not in campaign_idmap:
+            campaign_idmap[campaign_id] = {}
+        campaign_idmap[campaign_id][eid] = r.to_dict()
+
+    # Check for duplicate enzyme_ids within campaigns
+    from collections import defaultdict, Counter
+    campaign_enzyme_counts = defaultdict(list)
+    for _, r in df.iterrows():
+        eid = str(r["enzyme_id"])
+        campaign_id = str(r.get("campaign_id", "default"))
+        campaign_enzyme_counts[campaign_id].append(eid)
+
+    total_duplicates = 0
+    for campaign_id, enzyme_ids in campaign_enzyme_counts.items():
         id_counts = Counter(enzyme_ids)
         duplicates = {k: v for k, v in id_counts.items() if v > 1}
-        … (2 lines elided by the registry diff viewer)
+        if duplicates:
+            total_duplicates += sum(duplicates.values()) - len(duplicates)
+            log.warning(f"Campaign {campaign_id} has duplicate enzyme_ids: {duplicates}")
+
+    if total_duplicates > 0:
+        log.warning(f"Found {total_duplicates} duplicate enzyme_ids across campaigns")
+        log.info("All entries within each campaign will be preserved")

     output_rows: List[Dict[str, str]] = []
     skipped_count = 0
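Note: in the duplicate check above, `sum(duplicates.values()) - len(duplicates)` counts only the surplus copies, so an id seen twice contributes one duplicate and an id seen three times contributes two. Quick check with hypothetical ids:

    from collections import Counter

    enzyme_ids = ["A", "A", "B", "C", "C", "C"]
    id_counts = Counter(enzyme_ids)
    duplicates = {k: v for k, v in id_counts.items() if v > 1}  # {"A": 2, "C": 3}
    assert sum(duplicates.values()) - len(duplicates) == 3      # 1 surplus "A" + 2 surplus "C"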
@@ -747,23 +982,58 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
         smiles_string = _canonical_smiles(smiles_string)

-        # Mutations
-        … (13 lines elided by the registry diff viewer)
+        # Mutations - calculate based on generation 0 enzyme in same campaign --------
+        campaign_id = str(rec.row.get("campaign_id", "default"))
+        generation = str(rec.row.get("generation", "")).strip()
+        parent_id = rec.parent_id
+
+        # Find generation 0 enzyme in same campaign as reference (only for non-gen-0 enzymes)
+        reference_row = {}
+        if generation != "0":
+            for cid, cmap in campaign_idmap.items():
+                if cid == campaign_id:
+                    for enzyme_id, enzyme_row in cmap.items():
+                        enzyme_gen = str(enzyme_row.get("generation", "")).strip()
+                        if enzyme_gen == "0" or enzyme_gen == "0.0":
+                            reference_row = enzyme_row
+                            log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
+                            break
+                    break
+            if not reference_row:
+                log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")

-        … (2 lines elided by the registry diff viewer)
+        reference_aa = ""
+        reference_nt = ""
+        if reference_row:
+            reference_aa = (
+                str(reference_row.get("protein_sequence", ""))
+                or str(reference_row.get("aa_sequence", ""))
+            )
+            reference_nt = (
+                str(reference_row.get("nucleotide_sequence", ""))
+                or str(reference_row.get("nt_sequence", ""))
+            )
+            # If reference doesn't have NT sequence but has AA sequence, reverse translate
+            if (not reference_nt or reference_nt == "nan") and reference_aa and reference_aa != "nan":
+                reference_nt = _rev_translate(reference_aa)
+
+        # For generation 0 enzymes, don't calculate mutations (they are the reference)
+        if generation == "0":
+            aa_muts = ""
+            nt_muts = ""
+            log.info(f"Generation 0 enzyme {eid} - no mutations calculated (is reference)")
+        else:
+            # Debug sequence availability
+            log.info(f"Mutation calc for {eid}: gen={generation}, has_ref_aa={bool(reference_aa and reference_aa != 'nan')}, has_rec_aa={bool(rec.aa_seq and rec.aa_seq != 'nan')}")
+
+            # Calculate mutations relative to generation 0 reference
+            aa_muts = _aa_mut(reference_aa, rec.aa_seq) if rec.aa_seq and rec.aa_seq != "nan" and reference_aa and reference_aa != "nan" else ""
+            nt_muts = _nt_mut(reference_aa, rec.aa_seq, reference_nt, rec.nt_seq) if (reference_aa and reference_aa != "nan") or (reference_nt and reference_nt != "nan") else ""
+
+            if aa_muts or nt_muts:
+                log.info(f"Calculated mutations for {eid} relative to generation 0: AA={aa_muts}, NT={nt_muts}")
+            else:
+                log.warning(f"No mutations calculated for {eid} - ref_aa_len={len(reference_aa) if reference_aa else 0}, rec_aa_len={len(rec.aa_seq) if rec.aa_seq else 0}")

         # Plate / well --------------------------------------------------------
         barcode_plate, plate_name, well = _plate_and_well(idx)
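Note: `_aa_mut` and `_nt_mut` are not shown in this diff. For equal-length sequences the usual approach is a position-wise comparison of the generation-0 reference against the variant; the sketch below illustrates that idea under that assumption and is not the package's actual implementation:

    def aa_mutations(ref: str, var: str) -> str:
        # Emit mutations in the conventional A123T form, 1-indexed.
        if len(ref) != len(var):
            return ""  # length changes would need an alignment; not handled here
        muts = [f"{r}{i}{v}" for i, (r, v) in enumerate(zip(ref, var), start=1) if r != v]
        return "_".join(muts)

    assert aa_mutations("MTIKE", "MTLKE") == "I3L"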
@@ -785,13 +1055,18 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         cof_list = str(row.get("cofactor_list", "")).strip()
         cofactor = cof_iupac or cof_list

+        # Fitness type -------------------------------------------------------
+        fitness_type = ""
+        if rec.ttn_or_yield() is not None:
+            ttn_val = row.get("ttn")
+            fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
+
         # Additional info -----------------------------------------------------
         extra: Dict[str, str] = {
             k: str(v) for k, v in row.items() if k not in INPUT_REQUIRED + OPTIONAL_INPUT
         }
-        … (2 lines elided by the registry diff viewer)
-        extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
+        # Don't include fitness_type in additional_information since it's now a separate column
+        extra.pop("fitness_type", None)
         additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""

         flat = FlatRow(
@@ -806,9 +1081,13 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             nt_sequence=rec.nt_seq,
             aa_sequence=rec.aa_seq,
             fitness_value=rec.ttn_or_yield(),
+            fitness_type=fitness_type,
             cofactor=cofactor,
             reaction_condition=reaction_condition,
             ee=str(row.get("ee", "")),
+            campaign_id=campaign_id,
+            generation=generation,
+            parent_enzyme_id=parent_id,
             additional_information=additional_information,
         )
         output_rows.append(flat.as_dict())
debase/reaction_info_extractor.py
CHANGED
@@ -1332,12 +1332,28 @@ class ReactionExtractor:
             y_offset += pix.height * scale

         # Convert the page to a pixmap
-        … (1 line elided by the registry diff viewer)
+        # Limit zoom factor to avoid creating excessively large images
+        # Gemini has limits on image size (approx 20MB or 20 megapixels)
+        zoom = 5.0
+        estimated_pixels = (max_width * zoom) * (total_height * zoom)
+        max_pixels = 20_000_000  # 20 megapixels
+
+        if estimated_pixels > max_pixels:
+            # Calculate appropriate zoom to stay under limit
+            zoom = min(5.0, (max_pixels / (max_width * total_height)) ** 0.5)
+            LOGGER.warning(f"Reducing zoom from 5.0 to {zoom:.2f} to stay under {max_pixels/1e6:.1f} megapixel limit")
+
+        mat = fitz.Matrix(zoom, zoom)
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)

         # Convert to PNG and return
         img_bytes = combined_pix.tobytes("png")
+
+        # Check final size
+        final_size_mb = len(img_bytes) / (1024 * 1024)
+        if final_size_mb > 20:
+            LOGGER.warning(f"Combined image is {final_size_mb:.1f}MB, may be too large for vision API")
         output_doc.close()

         # Save debug file if available
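Note: the square root in the zoom clamp follows from pixel count growing with the square of the zoom: pixels = (w * zoom) * (h * zoom) = w * h * zoom^2, so requiring w * h * zoom^2 <= max_pixels gives zoom <= sqrt(max_pixels / (w * h)). Numeric check with a hypothetical combined-page size:

    max_pixels = 20_000_000
    max_width, total_height = 1700, 4400  # hypothetical page dimensions in points
    zoom = min(5.0, (max_pixels / (max_width * total_height)) ** 0.5)
    # At zoom 5.0 this page would render at 187 megapixels; the clamp brings the
    # zoom down to ~1.64 so the output lands right at the 20-megapixel cap.
    assert (max_width * zoom) * (total_height * zoom) <= max_pixels + 1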
@@ -2317,39 +2333,51 @@ Different campaigns may use different model reactions and substrates.
                 }
             )

-            response = model.generate_content(content_parts)
-
-            # Track token usage if available
             try:
-                … (1 line elided by the registry diff viewer)
-                input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
-                output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
-                if input_tokens or output_tokens:
-                    try:
-                        from .wrapper import add_token_usage
-                        add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
-                    except ImportError:
-                        pass  # wrapper not available
-            except Exception:
-                pass  # token tracking is best-effort
-
-            # Parse JSON from response
-            if response and response.text:
-                # Save debug output
-                if self.debug_dir:
-                    timestamp = int(time.time())
-                    _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
-                    _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")
+                response = model.generate_content(content_parts)

-                #
-                … (8 lines elided by the registry diff viewer)
+                # Track token usage if available
+                try:
+                    if hasattr(response, 'usage_metadata'):
+                        input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                        output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                        if input_tokens or output_tokens:
+                            try:
+                                from .wrapper import add_token_usage
+                                add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                            except ImportError:
+                                pass  # wrapper not available
+                except Exception:
+                    pass  # token tracking is best-effort
+
+                # Parse JSON from response
+                if response and response.text:
+                    # Save debug output
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
+                        _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")
+
+                    # Extract JSON from response
+                    text = response.text.strip()
+                    if text.startswith("```json"):
+                        text = text[7:]
+                    if text.endswith("```"):
+                        text = text[:-3]
+                    data = json.loads(text.strip())
+                else:
+                    raise ValueError("Empty response from multimodal model")
+            except Exception as vision_error:
+                LOGGER.error("Vision API call failed: %s", vision_error)
+                LOGGER.info("Falling back to text-only extraction")
+                # Fall back to text-only extraction
+                data = generate_json_with_retry(
+                    self.model,
+                    prompt,
+                    temperature=self.cfg.model_reaction_temperature,
+                    debug_dir=self.debug_dir,
+                    tag="model_reaction_fallback"
+                )
         else:
             # Fall back to text-only extraction
             data = generate_json_with_retry(