debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +656 -27
- debase/enzyme_lineage_extractor.py +1077 -109
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.2.dist-info/METADATA +0 -296
- debase-0.4.2.dist-info/RECORD +0 -16
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     This function:
     1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
     2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    3. Uses Gemini API for intelligent matching when exact matches fail
     """
     # Step 1: Clean up 3a data format
     log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
 
     # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}
+    campaign_enzymes = {}  # Track enzymes by campaign for Gemini matching
 
     # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
     reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
 
             # Also keep simple enzyme_id lookup as fallback
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
+
+            # Track enzymes by campaign for Gemini matching
+            if campaign_id not in campaign_enzymes:
+                campaign_enzymes[campaign_id] = []
+            campaign_enzymes[campaign_id].append({
+                "enzyme_id": eid,
+                "has_sequence": True,
+                "generation": str(row.get("generation", "")),
+                "parent_id": str(row.get("parent_enzyme_id", ""))
+            })
 
     log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
 
+    # Setup Gemini if available
+    gemini_model = None
+    if GEMINI_OK and GEMINI_API_KEY:
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+            log.info("Gemini API configured for intelligent enzyme matching")
+        except Exception as e:
+            log.warning(f"Failed to configure Gemini API: {e}")
+
     # Step 3: Fill missing sequences in substrate scope entries (3b)
     substrate_entries = df[df.get("data_type") == "substrate_scope"]
     log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
 
     filled_count = 0
+    gemini_matched_count = 0
+    unmatched_enzymes = []  # Track enzymes that need Gemini matching
+
     for idx, row in df.iterrows():
         if row.get("data_type") != "substrate_scope":
             continue
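The Gemini setup above degrades gracefully: when no key is configured, `gemini_model` stays `None` and only exact matching runs. For readers unfamiliar with the `google-generativeai` package, here is a minimal, self-contained sketch of the same configure-then-generate pattern; the environment-variable name and the prompt are illustrative, not taken from the package:

```python
# Minimal sketch of the Gemini client pattern used in the hunk above.
# Assumes the google-generativeai package and an API key in GEMINI_API_KEY.
import os
import google.generativeai as genai

gemini_model = None
try:
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    gemini_model = genai.GenerativeModel("gemini-1.5-flash")
except Exception as exc:
    print(f"Gemini unavailable, exact matching only: {exc}")

if gemini_model is not None:
    response = gemini_model.generate_content("Reply with the single word: ok")
    print(response.text.strip())
```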
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 if seq_lookup[composite_key]["nt_sequence"]:
                     df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
                     df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
                 filled_count += 1
                 log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
 
@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[eid]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[eid]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} (fallback lookup)")
 
         else:
-
+            # Collect for Gemini matching
+            unmatched_enzymes.append({
+                "idx": idx,
+                "enzyme_id": eid,
+                "campaign_id": campaign_id
+            })
+
+    # Step 4: Use Gemini for intelligent matching of unmatched enzymes
+    if unmatched_enzymes and gemini_model:
+        log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
+
+        # Group unmatched enzymes by campaign
+        unmatched_by_campaign = {}
+        for entry in unmatched_enzymes:
+            cid = entry["campaign_id"]
+            if cid not in unmatched_by_campaign:
+                unmatched_by_campaign[cid] = []
+            unmatched_by_campaign[cid].append(entry)
+
+        # Process each campaign
+        for campaign_id, entries in unmatched_by_campaign.items():
+            if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
+                log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
+                continue
+
+            # Get enzyme IDs that need matching
+            unmatched_ids = [e["enzyme_id"] for e in entries]
+
+            # Get available enzymes in this campaign
+            available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
+
+            if not available_ids:
+                log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
+                continue
+
+            # Create prompt for Gemini
+            prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
+These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
+
+Enzymes needing sequences (from substrate scope):
+{json.dumps(unmatched_ids, indent=2)}
+
+Enzymes with sequences available (from reaction data):
+{json.dumps(available_ids, indent=2)}
+
+Match each enzyme from the first list to its corresponding enzyme in the second list.
+Consider variations like:
+- Case differences (p411-hf vs P411-HF)
+- Underscore vs hyphen (p411_hf vs p411-hf)
+- Additional prefixes/suffixes
+- Similar naming patterns within the campaign
+
+Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
+{{"substrate_scope_id": "reaction_data_id", ...}}
+
+Only include matches you are confident about. If no match exists, omit that enzyme.
+"""
+
+            try:
+                response = gemini_model.generate_content(prompt)
+                mapping_text = response.text.strip()
+
+                # Extract JSON from response
+                if '```json' in mapping_text:
+                    mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+                elif '```' in mapping_text:
+                    mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+                mapping = json.loads(mapping_text)
+
+                # Apply the matches
+                for entry in entries:
+                    substrate_id = entry["enzyme_id"]
+                    if substrate_id in mapping:
+                        matched_id = mapping[substrate_id]
+                        composite_key = f"{campaign_id}_{matched_id}"
+
+                        if composite_key in seq_lookup:
+                            idx = entry["idx"]
+                            df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            if seq_lookup[composite_key]["nt_sequence"]:
+                                df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                                df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+
+                            # Also copy generation and parent_enzyme_id
+                            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
+
+                            # Store the match for later mutation copying
+                            df.at[idx, "_matched_enzyme_id"] = matched_id
+                            df.at[idx, "_matched_campaign_id"] = campaign_id
+
+                            gemini_matched_count += 1
+                            log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
+                        else:
+                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+
+            except Exception as e:
+                log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
 
-
-
+    # Final logging
+    total_filled = filled_count + gemini_matched_count
+    if total_filled > 0:
+        log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
+                 f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")
+
+    # Log any remaining unmatched
+    for entry in unmatched_enzymes:
+        if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
+                   if col in df.columns and df.at[entry["idx"], col]):
+            log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")
 
     return df
 
 
+def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
+    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.
+
+    This function looks for entries that were matched by Gemini and copies their
+    mutation information from the corresponding matched enzyme.
+    """
+    # Look for entries with _matched_enzyme_id (these were matched by Gemini)
+    if "_matched_enzyme_id" not in orig_df.columns:
+        return out_df
+
+    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
+
+    if len(matched_entries) == 0:
+        return out_df
+
+    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")
+
+    # Create a lookup of mutations from the output dataframe
+    mutation_lookup = {}
+    for idx, row in out_df.iterrows():
+        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
+        mutation_lookup[key] = {
+            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
+            "amino_acid_substitutions": row.get("amino_acid_substitutions", "")
+        }
+
+    # Copy mutations for matched entries
+    mutations_copied = 0
+    for idx, row in out_df.iterrows():
+        # Check if this row needs mutation copying
+        # Find the original row in orig_df with the same enzyme_id and campaign_id
+        orig_mask = (orig_df["enzyme_id"] == row["id"]) & (orig_df["campaign_id"] == row["campaign_id"])
+        orig_rows = orig_df[orig_mask]
+
+        if len(orig_rows) > 0 and "_matched_enzyme_id" in orig_rows.columns:
+            orig_row = orig_rows.iloc[0]
+            if pd.notna(orig_row.get("_matched_enzyme_id")):
+                # This was a Gemini-matched entry
+                matched_id = orig_row["_matched_enzyme_id"]
+                matched_campaign = orig_row["_matched_campaign_id"]
+                lookup_key = f"{matched_campaign}_{matched_id}"
+
+                if lookup_key in mutation_lookup:
+                    out_df.at[idx, "nucleotide_mutation"] = mutation_lookup[lookup_key]["nucleotide_mutation"]
+                    out_df.at[idx, "amino_acid_substitutions"] = mutation_lookup[lookup_key]["amino_acid_substitutions"]
+                    mutations_copied += 1
+                    log.debug(f"Copied mutations for {row['id']} from {matched_id}")
+
+    if mutations_copied > 0:
+        log.info(f"Successfully copied mutations for {mutations_copied} entries")
+
+    return out_df
+
+
 def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
     """Use Gemini API to identify parent enzymes for entries with missing parent information."""
     if not GEMINI_OK:
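The response handling in this hunk assumes Gemini may wrap its JSON answer in a markdown code fence. A standalone sketch of that fence-stripping parse, under the same assumption (`FENCE` is built character-by-character here only to avoid literal backtick runs inside this note):

```python
import json

FENCE = "`" * 3  # a markdown code-fence delimiter

def parse_fenced_json(text: str) -> dict:
    # Strip an optional json code fence, then parse -- the same logic as the hunk above.
    text = text.strip()
    if FENCE + "json" in text:
        text = text.split(FENCE + "json")[1].split(FENCE)[0].strip()
    elif FENCE in text:
        text = text.split(FENCE)[1].split(FENCE)[0].strip()
    return json.loads(text)

raw = FENCE + "json\n" + '{"p411-hf": "P411-HF"}' + "\n" + FENCE
print(parse_fenced_json(raw))  # {'p411-hf': 'P411-HF'}
```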
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)
 
-    #
-    df = _identify_parents_with_gemini(df)
-
-    # Fill sequences again after parent identification to propagate sequences from identified parents
-    df = _fill_missing_sequences(df)
+    # Note: Removed parent identification - we only want exact variant matching
 
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)
@@ -1095,6 +1286,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
+
+    # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
+    out_df = _copy_mutations_from_matched_enzymes(out_df, df)
+
     return out_df
 
 
@@ -1137,10 +1332,24 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")
 
-    # Combine dataframes
+    # Combine dataframes with deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
-        log.info("Combined data: %d total entries", len(df_in))
+        log.info("Combined data: %d total entries (before deduplication)", len(df_in))
+
+        # Deduplicate based on unique combination of campaign, variant, fitness, and product
+        # Define the key columns that should be unique
+        unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
+
+        # Check if we have these columns
+        available_cols = [col for col in unique_cols if col in df_in.columns]
+
+        if len(available_cols) >= 2:  # Need at least campaign_id and enzyme_id
+            # Keep the first occurrence of each unique combination
+            df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
+            log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
+        else:
+            log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
     else:
         df_in = dfs[0]
 
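For readers less familiar with pandas, a toy illustration of the `drop_duplicates(subset=..., keep='first')` call this hunk introduces; the column values below are invented:

```python
import pandas as pd

# Toy frame with a duplicated (campaign_id, enzyme_id, product_list) row.
df_in = pd.DataFrame({
    "campaign_id":  ["c1", "c1", "c1"],
    "enzyme_id":    ["P411-HF", "P411-HF", "P411-G8"],
    "product_list": ["2a", "2a", "2a"],
    "fitness":      [1.0, 1.0, 3.2],
})

unique_cols = ["campaign_id", "enzyme_id", "product_list"]
available_cols = [c for c in unique_cols if c in df_in.columns]
df_in = df_in.drop_duplicates(subset=available_cols, keep="first")
print(len(df_in))  # 2 -- the repeated P411-HF row is dropped, first occurrence kept
```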
debase/reaction_info_extractor.py
CHANGED
@@ -58,7 +58,7 @@ class Config:
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 4096
+    max_tokens: int = 12288  # Increased 3x from 4096
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -209,7 +209,7 @@ def _cached_gemini_call(
         parts,
         generation_config={
             "temperature": temperature,
-            "max_output_tokens": 8192
+            "max_output_tokens": 24576,  # Increased 3x from 8192
         }
     )
     # Track token usage if available
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
 - If no IUPAC name is found for a compound, return null for iupac_name
 - Include ALL compounds found or referenced
 
+IMPORTANT - ONE NAME PER COMPOUND:
+- Return ONLY ONE IUPAC name per compound identifier
+- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
+  1. Names explicitly labeled as "IUPAC name:" in the text
+  2. Names in compound characterization sections
+  3. The most systematic/complete chemical name
+- Do NOT return multiple IUPAC names in a single iupac_name field
+
 Return as JSON:
 {
   "compound_mappings": [
@@ -1907,8 +1915,14 @@ TEXT FROM MANUSCRIPT:
             f.write(prompt)
         LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-        # Make multimodal API call
-        response = self.model.generate_content(
+        # Make multimodal API call with increased token limit
+        response = self.model.generate_content(
+            content_parts,
+            generation_config={
+                "temperature": 0.0,
+                "max_output_tokens": 24576,  # Increased 3x for compound mapping
+            }
+        )
 
         # Track token usage if available
         try:
@@ -1971,6 +1985,7 @@ TEXT FROM MANUSCRIPT:
         compound_ids: List[str],
         initial_sections: List[str] = None,
         campaign_filter: Optional[str] = None,
+        iupac_location_hint: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
@@ -2002,14 +2017,57 @@ TEXT FROM MANUSCRIPT:
         LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
                     len(uncached_compound_ids), sorted(uncached_compound_ids))
 
-        # Tier 1:
-
-        "
-
-
-
-
-
+        # Tier 1: Use IUPAC location hint if provided, otherwise standard sections
+        if iupac_location_hint and iupac_location_hint.get('location'):
+            LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
+            if iupac_location_hint.get('compound_section_hint'):
+                LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
+
+            # Extract text from the specific IUPAC location
+            iupac_text = self._get_extended_text_around_location(
+                iupac_location_hint['location'],
+                before=2000,
+                after=10000
+            )
+
+            # Also check for compound-specific hints
+            compound_hint = iupac_location_hint.get('compound_section_hint', '')
+            if compound_hint and iupac_text:
+                # Search for the specific compound section
+                hint_pattern = re.escape(compound_hint)
+                match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
+                if match:
+                    # Extract more focused text around the compound hint
+                    start = max(0, match.start() - 500)
+                    end = min(len(iupac_text), match.end() + 2000)
+                    iupac_text = iupac_text[start:end]
+                    LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
+
+            extraction_text = iupac_text or ""
+            if extraction_text:
+                LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
+            else:
+                LOGGER.warning("Tier 1: No text found at IUPAC location hint")
+            # Add some manuscript context
+            manuscript_text = "\n\n".join(self.ms_pages[:5])
+        else:
+            # Fallback to standard sections
+            initial_sections = initial_sections or [
+                "General procedure", "Compound characterization",
+                "Synthesis", "Experimental", "Materials and methods"
+            ]
+
+            # Extract from initial sections - search in all pages (manuscript + SI)
+            extraction_text = self._extract_sections_by_title(initial_sections)
+
+            # If no sections found by title, include first few SI pages which often have compound data
+            if not extraction_text and self.si_pages:
+                # SI often starts with compound characterization after TOC
+                si_compound_pages = "\n\n".join(self.si_pages[2:10])  # Skip first 2 pages (usually TOC)
+                extraction_text = si_compound_pages
+
+            # Include manuscript pages (first 10) for model reaction context
+            manuscript_text = "\n\n".join(self.ms_pages[:10])
 
         # Add campaign context if provided
         campaign_context = ""
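The Tier 1 branch narrows the hinted region with simple max/min clamping so the slice never runs past either end of the text. A sketch of that windowing logic in isolation; `focus_window` is a hypothetical helper written for this note, not part of the package:

```python
import re

def focus_window(text: str, hint: str, before: int = 500, after: int = 2000) -> str:
    # Clamped slicing around the first case-insensitive hit, as in the Tier 1 code above;
    # returns the text unchanged when the hint is absent.
    m = re.search(re.escape(hint), text, re.IGNORECASE)
    if not m:
        return text
    start = max(0, m.start() - before)
    end = min(len(text), m.end() + after)
    return text[start:end]

si_text = "x" * 5000 + " Compound 2a: 2-phenylethan-1-ol " + "x" * 5000
snippet = focus_window(si_text, "Compound 2a")
print(len(snippet))  # 500 + len("Compound 2a") + 2000 = 2511
```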
@@ -2033,8 +2091,7 @@ Do NOT include compound information from other campaigns.
 
 """
 
-        #
-        extraction_text = self._extract_sections_by_title(initial_sections)
+        # Combine manuscript text, campaign context, and extraction text
         if extraction_text:
             extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
@@ -2083,11 +2140,11 @@ Do NOT include compound information from other campaigns.
                     figure_images[ref] = img_b64
                     LOGGER.info("Extracted %s for compound mapping", ref)
 
-            # Full text search including
-            full_text = "\n\n".join(self.all_pages
+            # Full text search including ALL pages (manuscript + SI)
+            full_text = "\n\n".join(self.all_pages)  # Send everything
 
             final_mappings = self._extract_compound_mappings_with_figures(
-                full_text
+                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
             )
 
             # Merge final mappings with better compound ID matching
@@ -2261,7 +2318,13 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-
+            # Pass the IUPAC location hint if we have it
+            iupac_hint = locations.get("iupac_location") if locations else None
+            compound_mappings = self._extract_compound_mappings_adaptive(
+                compound_ids,
+                campaign_filter=self.campaign_filter,
+                iupac_location_hint=iupac_hint
+            )
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -2404,6 +2467,34 @@ Different campaigns may use different model reactions and substrates.
         LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                     list(compound_mappings.keys()))
 
+        # First, populate IUPAC lists directly from compound mappings based on compound_type
+        substrate_iupacs_from_mappings = []
+        product_iupacs_from_mappings = []
+
+        for mapping in compound_mappings.values():
+            if mapping.iupac_name and mapping.compound_type:
+                if mapping.compound_type.lower() == "substrate":
+                    substrate_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
+                elif mapping.compound_type.lower() == "product":
+                    product_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
+
+        # Initialize or update the IUPAC lists with mapped compounds
+        if substrate_iupacs_from_mappings:
+            existing_substrates = data.get("substrate_iupac_list", []) or []
+            if isinstance(existing_substrates, list):
+                data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
+            else:
+                data["substrate_iupac_list"] = substrate_iupacs_from_mappings
+
+        if product_iupacs_from_mappings:
+            existing_products = data.get("product_iupac_list", []) or []
+            if isinstance(existing_products, list):
+                data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
+            else:
+                data["product_iupac_list"] = product_iupacs_from_mappings
+
         # Try to map substrate/product lists through compound IDs
         substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
         if isinstance(substrate_list, list):
@@ -3021,7 +3112,14 @@ def main() -> None:
             campaign_filter = all_campaigns[0]
             LOGGER.info("Detected single campaign: %s", campaign_filter)
 
-            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+            # Create campaign-specific debug directory even for single campaign
+            campaign_debug_dir = None
+            if args.debug_dir:
+                campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
+                campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                           campaign_filter=campaign_filter, all_campaigns=all_campaigns)
             df_metrics = extractor.run(enzyme_df)
 
@@ -3041,8 +3139,14 @@ def main() -> None:
                     LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
                     continue
 
-                # Create extractor for this campaign
-                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                # Create extractor for this campaign with campaign-specific debug directory
+                campaign_debug_dir = None
+                if args.debug_dir:
+                    campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
+                    campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                    LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                               campaign_filter=campaign, all_campaigns=all_campaigns)
 
                 # Run extraction for this campaign
|
|
3088
3192
|
df_metrics = pd.DataFrame()
|
3089
3193
|
else:
|
3090
3194
|
# No campaign information, process all enzymes together
|
3091
|
-
|
3195
|
+
campaign_debug_dir = None
|
3196
|
+
if args.debug_dir:
|
3197
|
+
campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
|
3198
|
+
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3199
|
+
LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
|
3200
|
+
|
3201
|
+
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3092
3202
|
campaign_filter=campaign_filter, all_campaigns=all_campaigns)
|
3093
3203
|
df_metrics = extractor.run(enzyme_df)
|
3094
3204
|
|
debase/substrate_scope_extractor.py
CHANGED
@@ -28,6 +28,7 @@ import re
 import json
 import time
 import logging
+import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any, Union
@@ -103,6 +104,52 @@ class CompoundMapping:
     compound_type: str = "unknown"
     source_location: Optional[str] = None
 
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command."""
+    if not name or len(name.strip()) < 3:
+        return False
+
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+
+        return False
+
+    except Exception as e:
+        log.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
+
+def _get_iupac_name(compound) -> str:
+    """Get IUPAC name for a compound, checking if the common name is already IUPAC."""
+    if not compound:
+        return ''
+
+    # If we already have an IUPAC name, use it
+    if compound.iupac_name:
+        return compound.iupac_name
+
+    # If no IUPAC name but we have a common name, check if it's already IUPAC
+    if compound.name:
+        # Check with OPSIN if the name is a valid IUPAC name
+        if is_valid_iupac_name_with_opsin(compound.name):
+            log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
+            return compound.name
+
+    return ''
+
 # === 3. LOGGING HELPERS ===
 
 # --- Debug dump helper ----------------------------------------------------
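The new helper shells out to a local `opsin` executable, so it assumes the OPSIN command-line tool is on PATH. A small usage sketch of the same invocation, returning the SMILES instead of a boolean; `name_to_smiles` and the printed SMILES are illustrative only (OPSIN may emit an equivalent but differently written string):

```python
import subprocess
from typing import Optional

def name_to_smiles(name: str) -> Optional[str]:
    # Same CLI call as is_valid_iupac_name_with_opsin above, but surfacing the SMILES.
    proc = subprocess.run(
        ["opsin", "-o", "smi"],
        input=name.strip(),
        text=True,
        capture_output=True,
        timeout=30,
    )
    out = proc.stdout.strip()
    return out if proc.returncode == 0 and out else None

print(name_to_smiles("2-phenylethan-1-ol"))         # a SMILES such as OCCc1ccccc1
print(name_to_smiles("definitely not a molecule"))  # None
```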
@@ -2532,9 +2579,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
         'flag': '',
 
         'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
-        'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
+        'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
         'product_list': '; '.join(p.name for p in entry.products if p.name),
-        'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
+        'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
 
         'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
         'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),